diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 438ca139757ad..18d90346d1d88 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1534,20 +1534,6 @@ static unsigned getMovOpc(bool IsScalar) { return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } -static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { - MI.setDesc(NewDesc); - - // Remove any leftover implicit operands from mutating the instruction. e.g. - // if we replace an s_and_b32 with a copy, we don't need the implicit scc def - // anymore. - const MCInstrDesc &Desc = MI.getDesc(); - unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() + - Desc.implicit_defs().size(); - - for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) - MI.removeOperand(I); -} - std::optional SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const { if (Op.isImm()) @@ -1586,7 +1572,8 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { Opc == AMDGPU::S_NOT_B32) && Src0Imm) { MI->getOperand(1).ChangeToImmediate(~*Src0Imm); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); return true; } @@ -1614,7 +1601,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { // instruction. MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); + TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR))); return true; } @@ -1634,11 +1621,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = or x, 0 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); } else if (Src1Val == -1) { // y = or x, -1 => y = v_mov_b32 -1 MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); } else return false; @@ -1650,11 +1638,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = and x, 0 => y = v_mov_b32 0 MI->removeOperand(Src0Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); } else if (Src1Val == -1) { // y = and x, -1 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); } else return false; @@ -1666,7 +1655,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = xor x, 0 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); return true; } } @@ -1712,7 +1701,7 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const { MI.removeOperand(Src1ModIdx); if (Src0ModIdx != -1) MI.removeOperand(Src0ModIdx); - mutateCopyOp(MI, NewDesc); + TII->mutateAndCleanupImplicit(MI, NewDesc); LLVM_DEBUG(dbgs() << MI); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f5b52425e7841..7cb7f47ddb220 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3460,6 +3460,21 @@ void SIInstrInfo::removeModOperands(MachineInstr &MI) const { } } +void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI, + const MCInstrDesc &NewDesc) const { + MI.setDesc(NewDesc); + + // Remove any leftover implicit operands from mutating the instruction. e.g. + // if we replace an s_and_b32 with a copy, we don't need the implicit scc def + // anymore. + const MCInstrDesc &Desc = MI.getDesc(); + unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() + + Desc.implicit_defs().size(); + + for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) + MI.removeOperand(I); +} + std::optional SIInstrInfo::extractSubregFromImm(int64_t Imm, unsigned SubRegIndex) { switch (SubRegIndex) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index b12d9525a7605..c66985a19685b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -425,6 +425,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void removeModOperands(MachineInstr &MI) const; + void mutateAndCleanupImplicit(MachineInstr &MI, + const MCInstrDesc &NewDesc) const; + /// Return the extracted immediate value in a subregister use from a constant /// materialized in a super register. /// diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 0d81cb935069c..1b78f67e76d07 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -586,7 +586,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { const bool IsUndef = SrcReg->isUndef(); const bool IsKill = SrcReg->isKill(); - MI.setDesc(TII->get(Opc)); + TII->mutateAndCleanupImplicit(MI, TII->get(Opc)); if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) { Src0->ChangeToImmediate(NewImm); diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 8e92dc80fe1d6..8879ef5c8265d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -169656,8 +169656,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s41, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s41 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s41, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s41, 22 ; VI-NEXT: s_and_b64 s[56:57], vcc, exec ; VI-NEXT: s_cselect_b32 s41, s41, s47 ; VI-NEXT: s_lshr_b32 s47, s41, 16 @@ -169668,8 +169668,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s41, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s41 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s41, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s41, 22 ; VI-NEXT: s_and_b64 s[56:57], vcc, exec ; VI-NEXT: s_cselect_b32 s56, s41, s47 ; VI-NEXT: s_and_b32 s40, s40, 0xffff0000 @@ -169690,8 +169690,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s56, s47, 0x10010 ; VI-NEXT: s_add_i32 s56, s56, s47 ; VI-NEXT: s_add_i32 s58, s56, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 s[56:57], vcc, exec ; VI-NEXT: s_cselect_b32 s56, s47, s58 ; VI-NEXT: s_and_b32 s17, s17, 0xffff0000 @@ -169700,8 +169700,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s17, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s17 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s17, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s17, 22 ; VI-NEXT: s_and_b64 s[58:59], vcc, exec ; VI-NEXT: s_cselect_b32 s17, s17, s47 ; VI-NEXT: s_lshr_b32 s57, s17, 16 @@ -169712,8 +169712,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s17 ; VI-NEXT: s_lshr_b64 s[56:57], s[56:57], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s17, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s17, 22 ; VI-NEXT: s_and_b64 s[58:59], vcc, exec ; VI-NEXT: s_cselect_b32 s58, s17, s47 ; VI-NEXT: s_and_b32 s16, s16, 0xffff0000 @@ -169734,8 +169734,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[16:17], s[58:59], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 s[58:59], vcc, exec ; VI-NEXT: s_cselect_b32 s58, s47, s57 ; VI-NEXT: s_and_b32 s15, s15, 0xffff0000 @@ -169744,8 +169744,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s15, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s15 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s15, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s15, 22 ; VI-NEXT: s_and_b64 s[60:61], vcc, exec ; VI-NEXT: s_cselect_b32 s15, s15, s47 ; VI-NEXT: s_lshr_b32 s59, s15, 16 @@ -169756,8 +169756,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s15 ; VI-NEXT: s_lshr_b64 s[58:59], s[58:59], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s15, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s15, 22 ; VI-NEXT: s_and_b64 s[60:61], vcc, exec ; VI-NEXT: s_cselect_b32 s60, s15, s47 ; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 @@ -169778,8 +169778,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[14:15], s[60:61], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 s[60:61], vcc, exec ; VI-NEXT: s_cselect_b32 s60, s47, s57 ; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 @@ -169788,8 +169788,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s13, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s13 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s13, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s13, 22 ; VI-NEXT: s_and_b64 s[62:63], vcc, exec ; VI-NEXT: s_cselect_b32 s13, s13, s47 ; VI-NEXT: s_lshr_b32 s61, s13, 16 @@ -169800,8 +169800,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s13 ; VI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s13, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s13, 22 ; VI-NEXT: s_and_b64 s[62:63], vcc, exec ; VI-NEXT: s_cselect_b32 s62, s13, s47 ; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 @@ -169822,8 +169822,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[12:13], s[62:63], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 s[62:63], vcc, exec ; VI-NEXT: s_cselect_b32 s62, s47, s57 ; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 @@ -169832,8 +169832,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s11, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s11 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s11, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s11, 22 ; VI-NEXT: s_and_b64 s[72:73], vcc, exec ; VI-NEXT: s_cselect_b32 s11, s11, s47 ; VI-NEXT: s_lshr_b32 s63, s11, 16 @@ -169844,8 +169844,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s11 ; VI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s11, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s11, 22 ; VI-NEXT: s_and_b64 s[72:73], vcc, exec ; VI-NEXT: s_cselect_b32 s72, s11, s47 ; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 @@ -169866,8 +169866,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[10:11], s[72:73], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 s[72:73], vcc, exec ; VI-NEXT: s_cselect_b32 s72, s47, s57 ; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 @@ -169876,8 +169876,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s9, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s9 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s9, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s9, 22 ; VI-NEXT: s_and_b64 s[74:75], vcc, exec ; VI-NEXT: s_cselect_b32 s9, s9, s47 ; VI-NEXT: s_lshr_b32 s73, s9, 16 @@ -169888,8 +169888,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s9 ; VI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s9, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s9, 22 ; VI-NEXT: s_and_b64 s[74:75], vcc, exec ; VI-NEXT: s_cselect_b32 s74, s9, s47 ; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 @@ -169910,8 +169910,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[8:9], s[74:75], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 s[74:75], vcc, exec ; VI-NEXT: s_cselect_b32 s74, s47, s57 ; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 @@ -169920,8 +169920,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s7, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s7 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s7, 22 ; VI-NEXT: s_and_b64 s[76:77], vcc, exec ; VI-NEXT: s_cselect_b32 s7, s7, s47 ; VI-NEXT: s_lshr_b32 s75, s7, 16 @@ -169932,8 +169932,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s7 ; VI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s7, 22 ; VI-NEXT: s_and_b64 s[76:77], vcc, exec ; VI-NEXT: s_cselect_b32 s76, s7, s47 ; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 @@ -169954,8 +169954,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[6:7], s[76:77], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 s[76:77], vcc, exec ; VI-NEXT: s_cselect_b32 s76, s47, s57 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 @@ -169964,8 +169964,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s5, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s5 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s5, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s5, 22 ; VI-NEXT: s_and_b64 s[78:79], vcc, exec ; VI-NEXT: s_cselect_b32 s5, s5, s47 ; VI-NEXT: s_lshr_b32 s77, s5, 16 @@ -169976,8 +169976,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s5 ; VI-NEXT: s_lshr_b64 s[76:77], s[76:77], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s5, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s5, 22 ; VI-NEXT: s_and_b64 s[78:79], vcc, exec ; VI-NEXT: s_cselect_b32 s78, s5, s47 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 @@ -169998,8 +169998,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[4:5], s[78:79], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 s[78:79], vcc, exec ; VI-NEXT: s_cselect_b32 s78, s47, s57 ; VI-NEXT: s_and_b32 s45, s45, 0xffff0000 @@ -170008,8 +170008,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s45, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s45 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s45, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s45, 22 ; VI-NEXT: s_and_b64 s[88:89], vcc, exec ; VI-NEXT: s_cselect_b32 s45, s45, s47 ; VI-NEXT: s_lshr_b32 s79, s45, 16 @@ -170020,8 +170020,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s45 ; VI-NEXT: s_lshr_b64 s[78:79], s[78:79], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s45, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s45, 22 ; VI-NEXT: s_and_b64 s[88:89], vcc, exec ; VI-NEXT: s_cselect_b32 s88, s45, s47 ; VI-NEXT: s_and_b32 s44, s44, 0xffff0000 @@ -170042,8 +170042,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[44:45], s[88:89], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 s[88:89], vcc, exec ; VI-NEXT: s_cselect_b32 s88, s47, s57 ; VI-NEXT: s_and_b32 s43, s43, 0xffff0000 @@ -170052,8 +170052,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s43, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s43 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s43, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s43, 22 ; VI-NEXT: s_and_b64 s[90:91], vcc, exec ; VI-NEXT: s_cselect_b32 s43, s43, s47 ; VI-NEXT: s_lshr_b32 s89, s43, 16 @@ -170064,8 +170064,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s43 ; VI-NEXT: s_lshr_b64 s[88:89], s[88:89], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s43, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s43, 22 ; VI-NEXT: s_and_b64 s[90:91], vcc, exec ; VI-NEXT: s_cselect_b32 s90, s43, s47 ; VI-NEXT: s_and_b32 s42, s42, 0xffff0000 @@ -170086,8 +170086,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[42:43], s[90:91], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 s[90:91], vcc, exec ; VI-NEXT: s_cselect_b32 s90, s47, s57 ; VI-NEXT: s_and_b32 s29, s29, 0xffff0000 @@ -170096,8 +170096,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s29, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s29 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s29, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s29, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s29, s29, s47 ; VI-NEXT: s_lshr_b32 s91, s29, 16 @@ -170108,8 +170108,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s29 ; VI-NEXT: s_lshr_b64 s[90:91], s[90:91], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s29, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s29, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s30, s29, s47 ; VI-NEXT: s_and_b32 s28, s28, 0xffff0000 @@ -170130,8 +170130,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[28:29], s[30:31], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s30, s47, s57 ; VI-NEXT: s_and_b32 s27, s27, 0xffff0000 @@ -170140,8 +170140,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s27, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s27 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s27, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s27, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s27, s27, s47 ; VI-NEXT: s_lshr_b32 s31, s27, 16 @@ -170152,8 +170152,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s27 ; VI-NEXT: s_lshr_b64 s[30:31], s[30:31], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s27, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s27, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s34, s27, s47 ; VI-NEXT: s_and_b32 s26, s26, 0xffff0000 @@ -170174,8 +170174,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[26:27], s[34:35], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s34, s47, s57 ; VI-NEXT: s_and_b32 s25, s25, 0xffff0000 @@ -170184,8 +170184,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s25, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s25 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s25, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s25, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s25, s25, s47 ; VI-NEXT: s_lshr_b32 s35, s25, 16 @@ -170196,8 +170196,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s25 ; VI-NEXT: s_lshr_b64 s[36:37], s[34:35], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s25, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s25, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s34, s25, s47 ; VI-NEXT: s_and_b32 s24, s24, 0xffff0000 @@ -170218,8 +170218,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[24:25], s[34:35], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s34, s47, s57 ; VI-NEXT: s_and_b32 s23, s23, 0xffff0000 @@ -170228,8 +170228,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s23, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s23 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s23, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s23, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s23, s23, s47 ; VI-NEXT: s_lshr_b32 s35, s23, 16 @@ -170240,8 +170240,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s23 ; VI-NEXT: s_lshr_b64 s[48:49], s[34:35], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s23, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s23, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s34, s23, s47 ; VI-NEXT: s_and_b32 s22, s22, 0xffff0000 @@ -170262,8 +170262,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[22:23], s[34:35], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s34, s47, s57 ; VI-NEXT: s_and_b32 s21, s21, 0xffff0000 @@ -170272,8 +170272,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s21, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s21 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s21, s21, s47 ; VI-NEXT: s_lshr_b32 s35, s21, 16 @@ -170284,8 +170284,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s21 ; VI-NEXT: s_lshr_b64 s[52:53], s[34:35], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s34, s21, s47 ; VI-NEXT: s_and_b32 s20, s20, 0xffff0000 @@ -170306,8 +170306,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s57, s57, s47 ; VI-NEXT: s_lshr_b64 s[20:21], s[34:35], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff -; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s47, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s34, s47, s57 ; VI-NEXT: s_and_b32 s19, s19, 0xffff0000 @@ -170316,8 +170316,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bfe_u32 s47, s19, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s19 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s19, s19, s47 ; VI-NEXT: s_lshr_b32 s35, s19, 16 @@ -170328,8 +170328,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_add_i32 s47, s47, s19 ; VI-NEXT: s_lshr_b64 s[64:65], s[34:35], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff -; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s34, s19, s47 ; VI-NEXT: s_and_b32 s18, s18, 0xffff0000 @@ -171476,8 +171476,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s41, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s41 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s41, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s41, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s41, s41, s56 ; GFX9-NEXT: s_and_b32 s46, s40, 0xffff0000 @@ -171500,8 +171500,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s40, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s40 ; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s40, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s40, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s40, s40, s57 ; GFX9-NEXT: s_and_b32 s46, s17, 0xffff0000 @@ -171523,8 +171523,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s17, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s17 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s17, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s17, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s17, s17, s56 ; GFX9-NEXT: s_and_b32 s46, s16, 0xffff0000 @@ -171547,8 +171547,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s16, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s16 ; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s16, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s16, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s16, s16, s57 ; GFX9-NEXT: s_and_b32 s46, s15, 0xffff0000 @@ -171570,8 +171570,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s15, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s15 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s15, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s15, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s15, s15, s56 ; GFX9-NEXT: s_and_b32 s46, s14, 0xffff0000 @@ -171594,8 +171594,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s14, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s14 ; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s14, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s14, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s14, s14, s57 ; GFX9-NEXT: s_and_b32 s46, s13, 0xffff0000 @@ -171617,8 +171617,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s13, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s13 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s13, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s13, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s13, s13, s56 ; GFX9-NEXT: s_and_b32 s46, s12, 0xffff0000 @@ -171641,8 +171641,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s12, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s12 ; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s12, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s12, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s12, s12, s57 ; GFX9-NEXT: s_and_b32 s46, s11, 0xffff0000 @@ -171664,8 +171664,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s11, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s11 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s11, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s11, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s11, s11, s56 ; GFX9-NEXT: s_and_b32 s46, s10, 0xffff0000 @@ -171688,8 +171688,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s10, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s10 ; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s10, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s10, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s10, s10, s57 ; GFX9-NEXT: s_and_b32 s46, s9, 0xffff0000 @@ -171711,8 +171711,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s9, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s9 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s9, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s9, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s9, s9, s56 ; GFX9-NEXT: s_and_b32 s46, s8, 0xffff0000 @@ -171735,8 +171735,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s8, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s8 ; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s8, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s8, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s8, s8, s57 ; GFX9-NEXT: s_and_b32 s46, s7, 0xffff0000 @@ -171758,8 +171758,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s7, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s7 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s7, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s7, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s7, s7, s56 ; GFX9-NEXT: s_and_b32 s46, s6, 0xffff0000 @@ -171780,8 +171780,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s6 ; GFX9-NEXT: s_add_i32 s58, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s6, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s6, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s6, s58 ; GFX9-NEXT: s_and_b32 s46, s5, 0xffff0000 @@ -171802,8 +171802,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s46, s5, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s5 ; GFX9-NEXT: s_add_i32 s58, s46, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s5, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s5, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec ; GFX9-NEXT: s_cselect_b32 s5, s5, s58 ; GFX9-NEXT: s_and_b32 s46, s4, 0xffff0000 @@ -171814,10 +171814,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_writelane_b32 v21, s57, 6 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 ; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s46, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_writelane_b32 v21, s59, 7 ; GFX9-NEXT: s_pack_ll_b32_b16 s47, s5, s59 +; GFX9-NEXT: s_bitset1_b32 s46, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s46, s46, s72 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 @@ -171827,8 +171827,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_add_i32 s58, s58, s4 ; GFX9-NEXT: s_lshr_b32 s46, s46, 16 ; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s4, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s4, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s4, s4, s72 ; GFX9-NEXT: s_and_b32 s58, s45, 0xffff0000 @@ -171849,8 +171849,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s45, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s45 ; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s45, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s45, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s45, s45, s72 ; GFX9-NEXT: s_and_b32 s58, s44, 0xffff0000 @@ -171873,8 +171873,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s44, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s44 ; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s44, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s44, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s44, s44, s73 ; GFX9-NEXT: s_and_b32 s58, s43, 0xffff0000 @@ -171896,8 +171896,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s43, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s43 ; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s43, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s43, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s43, s43, s72 ; GFX9-NEXT: s_and_b32 s58, s42, 0xffff0000 @@ -171920,8 +171920,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s42, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s42 ; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s42, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s42, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s42, s42, s73 ; GFX9-NEXT: s_and_b32 s58, s29, 0xffff0000 @@ -171943,8 +171943,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s29, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s29 ; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s29, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s29, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s29, s29, s72 ; GFX9-NEXT: s_and_b32 s58, s28, 0xffff0000 @@ -171967,8 +171967,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s28, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s28 ; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s28, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s28, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s28, s28, s73 ; GFX9-NEXT: s_and_b32 s58, s27, 0xffff0000 @@ -171990,8 +171990,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s27, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s27 ; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s27, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s27, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s27, s27, s72 ; GFX9-NEXT: s_and_b32 s58, s26, 0xffff0000 @@ -172014,8 +172014,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s26, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s26 ; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s26, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s26, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s26, s26, s73 ; GFX9-NEXT: s_and_b32 s58, s25, 0xffff0000 @@ -172037,8 +172037,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s25, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s25 ; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s25, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s25, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s25, s25, s72 ; GFX9-NEXT: s_and_b32 s58, s24, 0xffff0000 @@ -172061,8 +172061,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s24, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s24 ; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s24, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s24, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s24, s24, s73 ; GFX9-NEXT: s_and_b32 s58, s23, 0xffff0000 @@ -172084,8 +172084,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s23, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s23 ; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s23, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s23, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s23, s23, s72 ; GFX9-NEXT: s_and_b32 s58, s22, 0xffff0000 @@ -172108,8 +172108,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s22, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s22 ; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s22, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s22, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s22, s22, s73 ; GFX9-NEXT: s_and_b32 s58, s21, 0xffff0000 @@ -172131,8 +172131,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s21, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s21 ; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s21, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s21, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s21, s21, s72 ; GFX9-NEXT: s_and_b32 s58, s20, 0xffff0000 @@ -172153,8 +172153,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s20, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s20 ; GFX9-NEXT: s_add_i32 s38, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s20, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s20, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s20, s20, s38 ; GFX9-NEXT: s_and_b32 s58, s19, 0xffff0000 @@ -172175,8 +172175,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s19, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s19 ; GFX9-NEXT: s_add_i32 s38, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s19, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s19, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_cselect_b32 s19, s19, s38 ; GFX9-NEXT: s_and_b32 s58, s18, 0xffff0000 @@ -172197,8 +172197,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s58, s18, 0x10010 ; GFX9-NEXT: s_add_i32 s58, s58, s18 ; GFX9-NEXT: s_add_i32 s39, s58, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s18, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_bitset1_b32 s18, 22 ; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s59, s19, s69 ; GFX9-NEXT: s_cselect_b32 s18, s18, s39 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 4cf6ed8855818..a0c596ff9d5de 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -89148,8 +89148,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_bfe_u32 s21, s19, 0x10010 ; VI-NEXT: s_add_i32 s21, s21, s19 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: s_and_b64 s[22:23], vcc, exec ; VI-NEXT: s_cselect_b32 s19, s19, s21 ; VI-NEXT: s_lshr_b32 s21, s19, 16 @@ -89160,8 +89160,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_bfe_u32 s21, s19, 0x10010 ; VI-NEXT: s_add_i32 s21, s21, s19 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: s_and_b64 s[22:23], vcc, exec ; VI-NEXT: s_cselect_b32 s22, s19, s21 ; VI-NEXT: s_and_b32 s18, s18, 0xffff0000 @@ -89182,8 +89182,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_bfe_u32 s22, s21, 0x10010 ; VI-NEXT: s_add_i32 s22, s22, s21 ; VI-NEXT: s_add_i32 s24, s22, 0x7fff -; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: s_and_b64 s[22:23], vcc, exec ; VI-NEXT: s_cselect_b32 s22, s21, s24 ; VI-NEXT: s_and_b32 s17, s17, 0xffff0000 @@ -89192,8 +89192,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_bfe_u32 s21, s17, 0x10010 ; VI-NEXT: s_add_i32 s21, s21, s17 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s17, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s17, 22 ; VI-NEXT: s_and_b64 s[24:25], vcc, exec ; VI-NEXT: s_cselect_b32 s17, s17, s21 ; VI-NEXT: s_lshr_b32 s23, s17, 16 @@ -89204,8 +89204,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s21, s21, s17 ; VI-NEXT: s_lshr_b64 s[22:23], s[22:23], 16 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s17, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s17, 22 ; VI-NEXT: s_and_b64 s[24:25], vcc, exec ; VI-NEXT: s_cselect_b32 s24, s17, s21 ; VI-NEXT: s_and_b32 s16, s16, 0xffff0000 @@ -89226,8 +89226,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s23, s23, s21 ; VI-NEXT: s_lshr_b64 s[16:17], s[24:25], 16 ; VI-NEXT: s_addk_i32 s23, 0x7fff -; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: s_and_b64 s[24:25], vcc, exec ; VI-NEXT: s_cselect_b32 s24, s21, s23 ; VI-NEXT: s_and_b32 s15, s15, 0xffff0000 @@ -89236,8 +89236,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_bfe_u32 s21, s15, 0x10010 ; VI-NEXT: s_add_i32 s21, s21, s15 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s15, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s15, 22 ; VI-NEXT: s_and_b64 s[26:27], vcc, exec ; VI-NEXT: s_cselect_b32 s15, s15, s21 ; VI-NEXT: s_lshr_b32 s25, s15, 16 @@ -89248,8 +89248,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s21, s21, s15 ; VI-NEXT: s_lshr_b64 s[24:25], s[24:25], 16 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s15, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s15, 22 ; VI-NEXT: s_and_b64 s[26:27], vcc, exec ; VI-NEXT: s_cselect_b32 s26, s15, s21 ; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 @@ -89270,8 +89270,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s23, s23, s21 ; VI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 ; VI-NEXT: s_addk_i32 s23, 0x7fff -; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: s_and_b64 s[26:27], vcc, exec ; VI-NEXT: s_cselect_b32 s26, s21, s23 ; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 @@ -89280,8 +89280,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_bfe_u32 s21, s13, 0x10010 ; VI-NEXT: s_add_i32 s21, s21, s13 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s13, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s13, 22 ; VI-NEXT: s_and_b64 s[28:29], vcc, exec ; VI-NEXT: s_cselect_b32 s13, s13, s21 ; VI-NEXT: s_lshr_b32 s27, s13, 16 @@ -89292,8 +89292,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s21, s21, s13 ; VI-NEXT: s_lshr_b64 s[26:27], s[26:27], 16 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s13, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s13, 22 ; VI-NEXT: s_and_b64 s[28:29], vcc, exec ; VI-NEXT: s_cselect_b32 s28, s13, s21 ; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 @@ -89314,8 +89314,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s23, s23, s21 ; VI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 ; VI-NEXT: s_addk_i32 s23, 0x7fff -; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: s_and_b64 s[28:29], vcc, exec ; VI-NEXT: s_cselect_b32 s28, s21, s23 ; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 @@ -89324,8 +89324,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_bfe_u32 s21, s11, 0x10010 ; VI-NEXT: s_add_i32 s21, s21, s11 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s11, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s11, 22 ; VI-NEXT: s_and_b64 s[40:41], vcc, exec ; VI-NEXT: s_cselect_b32 s11, s11, s21 ; VI-NEXT: s_lshr_b32 s29, s11, 16 @@ -89336,8 +89336,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s21, s21, s11 ; VI-NEXT: s_lshr_b64 s[28:29], s[28:29], 16 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s11, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s11, 22 ; VI-NEXT: s_and_b64 s[40:41], vcc, exec ; VI-NEXT: s_cselect_b32 s40, s11, s21 ; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 @@ -89358,8 +89358,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s23, s23, s21 ; VI-NEXT: s_lshr_b64 s[10:11], s[40:41], 16 ; VI-NEXT: s_addk_i32 s23, 0x7fff -; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: s_and_b64 s[40:41], vcc, exec ; VI-NEXT: s_cselect_b32 s40, s21, s23 ; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 @@ -89368,8 +89368,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_bfe_u32 s21, s9, 0x10010 ; VI-NEXT: s_add_i32 s21, s21, s9 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s9, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s9, 22 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec ; VI-NEXT: s_cselect_b32 s9, s9, s21 ; VI-NEXT: s_lshr_b32 s41, s9, 16 @@ -89380,8 +89380,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s21, s21, s9 ; VI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s9, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s9, 22 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec ; VI-NEXT: s_cselect_b32 s42, s9, s21 ; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 @@ -89402,8 +89402,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s23, s23, s21 ; VI-NEXT: s_lshr_b64 s[8:9], s[42:43], 16 ; VI-NEXT: s_addk_i32 s23, 0x7fff -; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec ; VI-NEXT: s_cselect_b32 s42, s21, s23 ; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 @@ -89412,8 +89412,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_bfe_u32 s21, s7, 0x10010 ; VI-NEXT: s_add_i32 s21, s21, s7 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s7, 22 ; VI-NEXT: s_and_b64 s[44:45], vcc, exec ; VI-NEXT: s_cselect_b32 s7, s7, s21 ; VI-NEXT: s_lshr_b32 s43, s7, 16 @@ -89424,8 +89424,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s21, s21, s7 ; VI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s7, 22 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec ; VI-NEXT: s_cselect_b32 s42, s7, s21 ; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 @@ -89446,8 +89446,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s23, s23, s21 ; VI-NEXT: s_lshr_b64 s[6:7], s[42:43], 16 ; VI-NEXT: s_addk_i32 s23, 0x7fff -; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec ; VI-NEXT: s_cselect_b32 s42, s21, s23 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 @@ -89456,8 +89456,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_bfe_u32 s21, s5, 0x10010 ; VI-NEXT: s_add_i32 s21, s21, s5 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s5, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s5, 22 ; VI-NEXT: s_and_b64 s[44:45], vcc, exec ; VI-NEXT: s_cselect_b32 s5, s5, s21 ; VI-NEXT: s_lshr_b32 s43, s5, 16 @@ -89468,8 +89468,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_add_i32 s21, s21, s5 ; VI-NEXT: s_lshr_b64 s[56:57], s[42:43], 16 ; VI-NEXT: s_addk_i32 s21, 0x7fff -; VI-NEXT: s_bitset1_b32 s5, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_bitset1_b32 s5, 22 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec ; VI-NEXT: s_cselect_b32 s42, s5, s21 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 @@ -89948,8 +89948,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s19, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s19 ; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s19, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s19, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s19, s19, s22 ; GFX9-NEXT: s_and_b32 s20, s18, 0xffff0000 @@ -89970,8 +89970,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s18, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s18 ; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s18, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s18, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s18, s18, s23 ; GFX9-NEXT: s_and_b32 s20, s17, 0xffff0000 @@ -89993,8 +89993,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s17, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s17 ; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s17, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s17, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s17, s17, s22 ; GFX9-NEXT: s_and_b32 s20, s16, 0xffff0000 @@ -90015,8 +90015,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s16, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s16 ; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s16, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s16, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s16, s16, s23 ; GFX9-NEXT: s_and_b32 s20, s15, 0xffff0000 @@ -90038,8 +90038,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s15, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s15 ; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s15, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s15, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s15, s15, s22 ; GFX9-NEXT: s_and_b32 s20, s14, 0xffff0000 @@ -90060,8 +90060,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s14, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s14 ; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s14, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s14, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s14, s14, s23 ; GFX9-NEXT: s_and_b32 s20, s13, 0xffff0000 @@ -90083,8 +90083,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s13, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s13 ; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s13, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s13, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s13, s13, s22 ; GFX9-NEXT: s_and_b32 s20, s12, 0xffff0000 @@ -90105,8 +90105,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s12, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s12 ; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s12, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s12, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s12, s12, s23 ; GFX9-NEXT: s_and_b32 s20, s11, 0xffff0000 @@ -90128,8 +90128,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s11, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s11 ; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s11, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s11, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s11, s11, s22 ; GFX9-NEXT: s_and_b32 s20, s10, 0xffff0000 @@ -90150,8 +90150,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s10, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s10 ; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s10, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s10, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s10, s10, s23 ; GFX9-NEXT: s_and_b32 s20, s9, 0xffff0000 @@ -90173,8 +90173,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s9, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s9 ; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s9, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s9, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s9, s9, s22 ; GFX9-NEXT: s_and_b32 s20, s8, 0xffff0000 @@ -90195,8 +90195,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s8, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s8 ; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s8, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s8, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s8, s8, s23 ; GFX9-NEXT: s_and_b32 s20, s7, 0xffff0000 @@ -90218,8 +90218,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s7, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s7 ; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s7, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s7, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s7, s7, s22 ; GFX9-NEXT: s_and_b32 s20, s6, 0xffff0000 @@ -90240,8 +90240,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s6 ; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s6, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s6, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s6, s23 ; GFX9-NEXT: s_and_b32 s20, s5, 0xffff0000 @@ -90263,8 +90263,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s5, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s5 ; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s5, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_bitset1_b32 s5, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s5, s5, s22 ; GFX9-NEXT: s_and_b32 s20, s4, 0xffff0000 @@ -90285,8 +90285,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_bfe_u32 s20, s4, 0x10010 ; GFX9-NEXT: s_add_i32 s20, s20, s4 ; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s4, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_bitset1_b32 s4, 22 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: s_cselect_b32 s4, s4, s23 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-insts-scalar-bit-ops.mir b/llvm/test/CodeGen/AMDGPU/shrink-insts-scalar-bit-ops.mir index a8deda7ad9507..22aa2bea44b37 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-insts-scalar-bit-ops.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-insts-scalar-bit-ops.mir @@ -8,7 +8,7 @@ tracksRegLiveness: true body: | bb.0: ; CHECK-LABEL: name: undef_and_operand_to_bitset0 - ; CHECK: renamable $sgpr4 = S_BITSET0_B32 31, undef $sgpr4, implicit-def dead $scc + ; CHECK: renamable $sgpr4 = S_BITSET0_B32 31, undef $sgpr4 ; CHECK-NEXT: S_ENDPGM 0, implicit $sgpr4 renamable $sgpr4 = S_AND_B32 undef renamable $sgpr4, 2147483647, implicit-def dead $scc S_ENDPGM 0, implicit $sgpr4 @@ -50,7 +50,7 @@ body: | ; CHECK-LABEL: name: kill_and_operand_to_bitset0 ; CHECK: liveins: $sgpr4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr4 = S_BITSET0_B32 31, killed $sgpr4, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr4 = S_BITSET0_B32 31, killed $sgpr4 ; CHECK-NEXT: S_ENDPGM 0, implicit $sgpr4 renamable $sgpr4 = S_AND_B32 killed renamable $sgpr4, 2147483647, implicit-def dead $scc S_ENDPGM 0, implicit $sgpr4