diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 4ca2de216f487..463b8c40350b2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -670,6 +670,125 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
   return true;
 }
 
+/// Select a pack of two 16-bit pieces into one 32-bit register.
+///
+/// Called from selectG_BUILD_VECTOR and, for s32 = G_MERGE_VALUES s16, s16,
+/// from selectG_MERGE_VALUES. Operand 1 of \p MI provides bits [15:0] of the
+/// result and operand 2 provides bits [31:16].
+///
+/// \returns true on success, false if a register could not be constrained.
+bool AMDGPUInstructionSelector::selectS16MergeToS32(MachineInstr &MI) const {
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(1).getReg();
+  Register Src1 = MI.getOperand(2).getReg();
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock *BB = MI.getParent();
+
+  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
+  if (DstBank->getID() == AMDGPU::VGPRRegBankID) {
+    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, TRI);
+    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, TRI);
+    const LLT S16 = LLT::scalar(16);
+
+    // If both sources are 16-bit VGPRs, tie them together with a REG_SEQUENCE
+    // on the lo16/hi16 subregisters. This is only valid with real true16
+    // instructions; otherwise s16 values are kept in 32-bit VGPRs and the
+    // 16-bit subregister indices would not describe the sources.
+    if (STI.useRealTrue16Insts() &&
+        Src0Bank->getID() == AMDGPU::VGPRRegBankID &&
+        Src1Bank->getID() == AMDGPU::VGPRRegBankID &&
+        MRI->getType(Src0) == S16 && MRI->getType(Src1) == S16) {
+      // Constrain all registers before emitting anything so that a failure
+      // does not leave a stray REG_SEQUENCE next to the unselected MI.
+      if (!RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
+          !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_16RegClass, *MRI) ||
+          !RBI.constrainGenericRegister(Src1, AMDGPU::VGPR_16RegClass, *MRI))
+        return false;
+
+      BuildMI(*BB, MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), Dst)
+          .addReg(Src0)
+          .addImm(AMDGPU::lo16)
+          .addReg(Src1)
+          .addImm(AMDGPU::hi16);
+
+      MI.eraseFromParent();
+      return true;
+    }
+
+    // Otherwise compute (Src1 << 16) | (Src0 & 0xffff) with V_AND_B32 and
+    // V_LSHL_OR_B32.
+    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
+                   .addImm(0xFFFF)
+                   .addReg(Src0);
+    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+
+    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
+              .addReg(Src1)
+              .addImm(16)
+              .addReg(TmpReg);
+    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // SGPR case -> S_PACK_*_B32_B16
+  // With multiple uses of the shift, this will duplicate the shift and
+  // increase register pressure.
+  //
+  // (merge (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
+  //  => (S_PACK_HH_B32_B16 $src0, $src1)
+  // (merge (lshr_oneuse SReg_32:$src0, 16), $src1)
+  //  => (S_PACK_HL_B32_B16 $src0, $src1)
+  // (merge $src0, (lshr_oneuse SReg_32:$src1, 16))
+  //  => (S_PACK_LH_B32_B16 $src0, $src1)
+  // (merge $src0, $src1)
+  //  => (S_PACK_LL_B32_B16 $src0, $src1)
+  Register ShiftSrc0;
+  Register ShiftSrc1;
+
+  bool Shift0 = mi_match(
+      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
+
+  bool Shift1 = mi_match(
+      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
+
+  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
+  if (Shift0 && Shift1) {
+    Opc = AMDGPU::S_PACK_HH_B32_B16;
+    MI.getOperand(1).setReg(ShiftSrc0);
+    MI.getOperand(2).setReg(ShiftSrc1);
+  } else if (Shift1) {
+    Opc = AMDGPU::S_PACK_LH_B32_B16;
+    MI.getOperand(2).setReg(ShiftSrc1);
+  } else if (Shift0) {
+    auto ConstSrc1 =
+        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
+    if (ConstSrc1 && ConstSrc1->Value == 0) {
+      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
+      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
+                     .addReg(ShiftSrc0)
+                     .addImm(16)
+                     .setOperandDead(3); // Dead scc
+
+      MI.eraseFromParent();
+      constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+      return true;
+    }
+    // Without S_PACK_HL the shift is kept and S_PACK_LL is used instead.
+    if (STI.hasSPackHL()) {
+      Opc = AMDGPU::S_PACK_HL_B32_B16;
+      MI.getOperand(1).setReg(ShiftSrc0);
+    }
+  }
+
+  MI.setDesc(TII.get(Opc));
+  constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
   MachineBasicBlock *BB = MI.getParent();
   Register DstReg = MI.getOperand(0).getReg();
@@ -677,8 +796,14 @@ bool
AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); const unsigned SrcSize = SrcTy.getSizeInBits(); - if (SrcSize < 32) + if (SrcSize < 32) { + // Handle s32 <- G_MERGE_VALUES s16, s16 + if (SrcSize == 16 && DstTy.getSizeInBits() == 32 && + MI.getNumOperands() == 3) { + return selectS16MergeToS32(MI); + } return selectImpl(MI, *CoverageInfo); + } const DebugLoc &DL = MI.getDebugLoc(); const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); @@ -839,76 +952,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const { RBI.constrainGenericRegister(Src0, RC, *MRI); } - // TODO: Can be improved? - if (IsVector) { - Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg) - .addImm(0xFFFF) - .addReg(Src0); - constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); - - MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst) - .addReg(Src1) - .addImm(16) - .addReg(TmpReg); - constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); - - MI.eraseFromParent(); - return true; - } - - Register ShiftSrc0; - Register ShiftSrc1; - - // With multiple uses of the shift, this will duplicate the shift and - // increase register pressure. 
- // - // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16) - // => (S_PACK_HH_B32_B16 $src0, $src1) - // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1) - // => (S_PACK_HL_B32_B16 $src0, $src1) - // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16)) - // => (S_PACK_LH_B32_B16 $src0, $src1) - // (build_vector $src0, $src1) - // => (S_PACK_LL_B32_B16 $src0, $src1) - - bool Shift0 = mi_match( - Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16)))); - - bool Shift1 = mi_match( - Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16)))); - - unsigned Opc = AMDGPU::S_PACK_LL_B32_B16; - if (Shift0 && Shift1) { - Opc = AMDGPU::S_PACK_HH_B32_B16; - MI.getOperand(1).setReg(ShiftSrc0); - MI.getOperand(2).setReg(ShiftSrc1); - } else if (Shift1) { - Opc = AMDGPU::S_PACK_LH_B32_B16; - MI.getOperand(2).setReg(ShiftSrc1); - } else if (Shift0) { - auto ConstSrc1 = - getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true); - if (ConstSrc1 && ConstSrc1->Value == 0) { - // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 - auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) - .addReg(ShiftSrc0) - .addImm(16) - .setOperandDead(3); // Dead scc - - MI.eraseFromParent(); - constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); - return true; - } - if (STI.hasSPackHL()) { - Opc = AMDGPU::S_PACK_HL_B32_B16; - MI.getOperand(1).setReg(ShiftSrc0); - } - } - - MI.setDesc(TII.get(Opc)); - constrainSelectedInstRegOperands(MI, TII, TRI, RBI); - return true; + return selectS16MergeToS32(MI); } bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 7a7a688262934..d3c0703e0b07f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -461,6 +461,8 @@ class AMDGPUInstructionSelector final : public 
InstructionSelector { /// Match an any extend from a 32-bit value to 64-bit. Register matchAnyExtendFromS32(Register Reg) const; + bool selectS16MergeToS32(MachineInstr &MI) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 027ea688b3102..61a4a651b60ad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -699,13 +699,13 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}}) .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}}); - addRulesForGOpcs({G_BUILD_VECTOR}) + addRulesForGOpcs({G_BUILD_VECTOR, G_MERGE_VALUES}) .Any({{UniBRC, S16}, {{}, {}, VerifyAllSgpr}}) .Any({{UniBRC, BRC}, {{}, {}, VerifyAllSgpr}}) .Any({{DivBRC, S16}, {{}, {}, ApplyAllVgpr}}) .Any({{DivBRC, BRC}, {{}, {}, ApplyAllVgpr}}); - addRulesForGOpcs({G_MERGE_VALUES, G_CONCAT_VECTORS}) + addRulesForGOpcs({G_CONCAT_VECTORS}) .Any({{UniBRC, BRC}, {{}, {}, VerifyAllSgpr}}) .Any({{DivBRC, BRC}, {{}, {}, ApplyAllVgpr}}); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll index 22daebe753b1c..cbcf346e8bb62 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: not llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < 
%s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s ; FIXME: regbankcombiner regression, related to: @@ -459,12 +459,26 @@ define i16 @v_andn2_i16(i16 %src0, i16 %src1) { ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_andn2_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_andn2_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_andn2_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, v1.l, -1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_andn2_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 ret i16 %and @@ -478,12 +492,26 @@ define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) { ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: v_andn2_i16_sv: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0 -; 
GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: v_andn2_i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: v_andn2_i16_sv: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, -1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, s2, v0.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_andn2_i16_sv: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 %zext = zext i16 %and to i32 @@ -499,12 +527,26 @@ define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) { ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: v_andn2_i16_vs: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: v_andn2_i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_xor_b32 s0, s2, -1 +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: v_andn2_i16_vs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_xor_b32 s0, s2, -1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, s0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_andn2_i16_vs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_xor_b32 s0, s2, -1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, s0, v0 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 %zext = zext i16 %and to i32 @@ -692,17 +734,38 @@ define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_andn2_v3i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1 -; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_andn2_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b64 s[0:1], -1 +; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_andn2_v3i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-TRUE16-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX11-TRUE16-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_andn2_v3i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-FAKE16-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX11-FAKE16-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; 
GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %not.src1 = xor <3 x i16> %src1, %and = and <3 x i16> %src0, %not.src1 %cast = bitcast <3 x i16> %and to i48 @@ -745,17 +808,38 @@ define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inr ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_andn2_v3i16_commute: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1 -; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_andn2_v3i16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b64 s[0:1], -1 +; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_andn2_v3i16_commute: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-TRUE16-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX11-TRUE16-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_andn2_v3i16_commute: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-FAKE16-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX11-FAKE16-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: 
s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %not.src1 = xor <3 x i16> %src1, %and = and <3 x i16> %not.src1, %src0 %cast = bitcast <3 x i16> %and to i48 @@ -808,22 +892,51 @@ define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 ; GFX9-NEXT: s_and_b32 s3, s5, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_andn2_v3i16_multi_use: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1 -; GFX10PLUS-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] -; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] -; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10PLUS-NEXT: s_or_b32 s2, s2, s3 -; GFX10PLUS-NEXT: s_and_b32 s3, s5, 0xffff -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_andn2_v3i16_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b64 s[0:1], -1 +; GFX10-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: s_lshr_b32 s3, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_and_b32 s3, s5, 0xffff +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_andn2_v3i16_multi_use: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-TRUE16-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX11-TRUE16-NEXT: 
s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s5 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_andn2_v3i16_multi_use: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-FAKE16-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX11-FAKE16-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s5, 0xffff +; GFX11-FAKE16-NEXT: ; return to shader part epilog %not.src1 = xor <3 x i16> %src1, %and = and <3 x i16> %src0, %not.src1 %cast.0 = bitcast <3 x i16> %and to i48 @@ -1127,5 +1240,3 @@ define <4 x i16> @v_andn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) { %and = and <4 x i16> %src0, %not.src1 ret <4 x i16> %and } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11-FAKE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index d7887507160cd..6b1b797e3cfec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s -; RUN: not llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -o - %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -o - %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { @@ -318,45 +318,85 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_i7: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) -; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_cvt_u32_f32_e32 v3, v3 -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: s_mul_i32 s1, s0, -7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX11-NEXT: s_add_i32 s0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_hi_u32 v5, v2, s0 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, -7, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u32_e32 v2, -7, v3 -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u16 v3, 6, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0x7f, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_i7: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0x7f, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 1, v0.h +; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-TRUE16-NEXT: s_mul_i32 s1, s0, -7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mul_hi_u32 s1, s0, s1 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_hi_u32 v5, v2, s0 +; GFX11-TRUE16-NEXT: v_mad_u64_u32 v[3:4], null, v5, -7, v[2:3] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, -7, v3 +; GFX11-TRUE16-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, -7, v2 +; GFX11-TRUE16-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v1.l, 6, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7f, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7f, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_i7: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-FAKE16-NEXT: s_mul_i32 s1, s0, -7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_mul_hi_u32 s1, s0, s1 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v5, v2, s0 +; GFX11-FAKE16-NEXT: v_mad_u64_u32 v[3:4], null, v5, -7, v[2:3] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, -7, v3 +; GFX11-FAKE16-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, -7, v2 +; GFX11-FAKE16-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0x7f, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v3, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 
s[30:31] %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt) ret i7 %result } @@ -475,21 +515,37 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_xor_b16 v1.l, v2.l, -1 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v2.l, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 1, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, v1.l, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v3, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt) ret i8 %result } @@ -578,15 +634,25 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_i8_4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v0, 4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b16 v1, 4, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_i8_4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 4, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 4, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_i8_4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 4, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 4, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 
%result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4) ret i8 %result } @@ -675,15 +741,25 @@ define i8 @v_fshl_i8_5(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_i8_5: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v0, 5, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b16 v1, 3, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_i8_5: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 5, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 3, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_i8_5: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 5, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5) ret i8 %result } @@ -932,37 +1008,69 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_v2i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v2 
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 7, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX11-NEXT: v_lshrrev_b16 v3, 1, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX11-NEXT: v_lshlrev_b16 v4, v4, v5 -; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b16 v3, v6, v3 -; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_v2i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_xor_b16 v2.h, v2.l, -1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_xor_b16 v1.h, v4.l, -1 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, v4.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 7 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, 1, v1.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 1, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v1.h, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, v2.h, 7 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v3.l, v3.l, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, v2.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_v2i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v7, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %amt = bitcast i16 %amt.arg to <2 x i8> @@ -1433,61 +1541,117 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_v4i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_xor_b32_e32 v13, -1, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX11-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX11-NEXT: v_xor_b32_e32 v8, -1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 -; GFX11-NEXT: v_xor_b32_e32 v13, -1, v10 -; GFX11-NEXT: v_and_b32_e32 v11, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v1 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 -; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX11-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 
25, v1 -; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_lshrrev_b16 v12, 1, v12 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX11-NEXT: v_lshlrev_b16 v4, v9, v4 -; GFX11-NEXT: v_lshrrev_b16 v6, v8, v7 -; GFX11-NEXT: v_lshlrev_b16 v5, v10, v5 -; GFX11-NEXT: v_lshrrev_b16 v1, v13, v1 -; GFX11-NEXT: v_lshlrev_b16 v0, v11, v0 -; GFX11-NEXT: v_lshrrev_b16 v2, v2, v12 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_v4i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_xor_b16 v3.h, v4.l, -1 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, v4.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_xor_b16 v5.h, v10.l, -1 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v3.l, 1, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 
v3.h, v3.h, 7 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_xor_b16 v5.l, v9.l, -1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v2.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_xor_b16 v2.l, v2.l, -1 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v3.l, v3.h, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, v9.l, 7 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v4.h, 1, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, v5.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, v10.l, 7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 25, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, v5.h, 7 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v2.h, 1, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 7 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, v3.h, v7.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v3.l, v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, v6.l, v8.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_v4i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v13, -1, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v6, 1, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v3, v8, v3 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v8, -1, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v6, v13, v6 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v13, -1, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 7, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v1 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 25, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v12, 1, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v4, v9, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v5, v10, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v13, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v11, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v2, v2, v12 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> %amt = bitcast i32 %amt.arg to <4 x i8> @@ -2335,133 +2499,242 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_fshl_v2i24: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_lshr_b32 s8, s0, 24 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX11-NEXT: s_or_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s1, 8 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s7, s9, 0xff -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX11-NEXT: s_or_b32 s1, s8, s1 -; GFX11-NEXT: 
s_and_b32 s7, 0xffff, s7 -; GFX11-NEXT: s_or_b32 s0, s0, s6 -; GFX11-NEXT: s_lshr_b32 s6, s2, 8 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s7, s7, 16 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_or_b32 s1, s1, s7 -; GFX11-NEXT: s_lshr_b32 s7, s2, 16 -; GFX11-NEXT: s_lshr_b32 s8, s2, 24 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_lshr_b32 s9, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s6 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) -; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_and_b32 s7, s9, 0xff -; GFX11-NEXT: s_or_b32 s3, s8, s3 -; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX11-NEXT: s_or_b32 s2, s2, s6 -; GFX11-NEXT: s_lshr_b32 s6, s4, 8 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX11-NEXT: s_lshl_b32 s7, s7, 16 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_or_b32 s3, s3, s7 -; GFX11-NEXT: s_lshr_b32 s7, s4, 16 -; GFX11-NEXT: s_lshr_b32 s8, s4, 24 -; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_lshr_b32 s9, s5, 8 -; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: v_readfirstlane_b32 s7, v0 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-NEXT: s_and_b32 s5, s5, 0xff -; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: s_mul_i32 s6, s7, 0xffffffe8 -; GFX11-NEXT: s_lshl_b32 s5, s5, 8 -; GFX11-NEXT: s_mul_hi_u32 s6, s7, s6 -; GFX11-NEXT: s_or_b32 s5, s8, s5 -; GFX11-NEXT: s_add_i32 s7, s7, s6 -; GFX11-NEXT: s_and_b32 s8, s9, 0xff -; GFX11-NEXT: s_mul_hi_u32 s6, s4, s7 -; GFX11-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX11-NEXT: s_mulk_i32 s6, 0xffe8 
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-NEXT: s_add_i32 s4, s4, s6 -; GFX11-NEXT: s_or_b32 s5, s5, s8 -; GFX11-NEXT: s_cmp_ge_u32 s4, 24 -; GFX11-NEXT: s_cselect_b32 s6, 1, 0 -; GFX11-NEXT: s_sub_i32 s8, s4, 24 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 -; GFX11-NEXT: s_cselect_b32 s4, s8, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_ge_u32 s4, 24 -; GFX11-NEXT: s_cselect_b32 s6, 1, 0 -; GFX11-NEXT: s_sub_i32 s8, s4, 24 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 -; GFX11-NEXT: s_cselect_b32 s4, s8, s4 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_sub_i32 s6, 23, s4 -; GFX11-NEXT: s_lshl_b32 s0, s0, s4 -; GFX11-NEXT: s_mul_hi_u32 s4, s5, s7 -; GFX11-NEXT: s_lshr_b32 s2, s2, s6 -; GFX11-NEXT: s_mulk_i32 s4, 0xffe8 -; GFX11-NEXT: s_or_b32 s0, s0, s2 -; GFX11-NEXT: s_add_i32 s5, s5, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_ge_u32 s5, 24 -; GFX11-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-NEXT: s_sub_i32 s4, s5, 24 -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_cselect_b32 s2, s4, s5 -; GFX11-NEXT: s_cmp_ge_u32 s2, 24 -; GFX11-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-NEXT: s_sub_i32 s5, s2, 24 -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s2, s5, s2 -; GFX11-NEXT: s_lshr_b32 s3, s3, 1 -; GFX11-NEXT: s_sub_i32 s4, 23, s2 -; GFX11-NEXT: s_lshl_b32 s1, s1, s2 -; GFX11-NEXT: s_lshr_b32 s2, s3, s4 -; GFX11-NEXT: s_and_b32 s3, s0, 0xff -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_and_b32 s3, s1, 0xff -; GFX11-NEXT: s_or_b32 s0, s2, s0 -; GFX11-NEXT: s_lshl_b32 s2, s3, 24 -; GFX11-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80008 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s2 -; 
GFX11-NEXT: s_or_b32 s1, s1, s3 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_fshl_v2i24: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 24 +; GFX11-TRUE16-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s8, s1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s2, 24 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s3, s10, s3 +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s9, 0xff +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s11, 0xff +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 24 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s11, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s10 +; GFX11-TRUE16-NEXT: s_mul_i32 s10, s13, 
0xffffffe8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX11-TRUE16-NEXT: s_mul_hi_u32 s10, s13, s10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_mul_hi_u32 s10, s4, s13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_mulk_i32 s10, 0xffe8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s12, s5 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s14, 0xff +; GFX11-TRUE16-NEXT: s_cmp_ge_u32 s4, 24 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, 1, 0 +; GFX11-TRUE16-NEXT: s_sub_i32 s12, s4, 24 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s12, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-TRUE16-NEXT: s_cmp_ge_u32 s4, 24 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s9 +; GFX11-TRUE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-TRUE16-NEXT: s_sub_i32 s8, s4, 24 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s8, s4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-TRUE16-NEXT: s_sub_i32 s6, 23, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_mul_hi_u32 s4, s5, s13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, s6 +; GFX11-TRUE16-NEXT: s_mulk_i32 s4, 0xffe8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_cmp_ge_u32 s5, 24 +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11-TRUE16-NEXT: s_sub_i32 s4, s5, 24 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s4, s5 +; GFX11-TRUE16-NEXT: s_cmp_ge_u32 s2, 24 +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-TRUE16-NEXT: s_sub_i32 
s5, s2, 24 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s5, s2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-TRUE16-NEXT: s_sub_i32 s4, 23, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s0, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s0, s2, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s3, 24 +; GFX11-TRUE16-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX11-TRUE16-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s3 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_fshl_v2i24: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 24 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s9, 0xff +; GFX11-FAKE16-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s8, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 8 +; 
GFX11-FAKE16-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 24 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s9, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s4, 24 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s5, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s6 +; GFX11-FAKE16-NEXT: s_mul_i32 s6, s7, 0xffffffe8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-FAKE16-NEXT: s_mul_hi_u32 s6, s7, s6 +; 
GFX11-FAKE16-NEXT: s_or_b32 s5, s8, s5 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-FAKE16-NEXT: s_mul_hi_u32 s6, s4, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX11-FAKE16-NEXT: s_mulk_i32 s6, 0xffe8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s8 +; GFX11-FAKE16-NEXT: s_cmp_ge_u32 s4, 24 +; GFX11-FAKE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-FAKE16-NEXT: s_sub_i32 s8, s4, 24 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s8, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_cmp_ge_u32 s4, 24 +; GFX11-FAKE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-FAKE16-NEXT: s_sub_i32 s8, s4, 24 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s8, s4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-FAKE16-NEXT: s_sub_i32 s6, 23, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_mul_hi_u32 s4, s5, s7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, s6 +; GFX11-FAKE16-NEXT: s_mulk_i32 s4, 0xffe8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_cmp_ge_u32 s5, 24 +; GFX11-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11-FAKE16-NEXT: s_sub_i32 s4, s5, 24 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s4, s5 +; GFX11-FAKE16-NEXT: s_cmp_ge_u32 s2, 24 +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-FAKE16-NEXT: s_sub_i32 s5, s2, 24 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s5, s2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-FAKE16-NEXT: s_sub_i32 s4, 23, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s3, s4 +; 
GFX11-FAKE16-NEXT: s_and_b32 s3, s0, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s3, 24 +; GFX11-FAKE16-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX11-FAKE16-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s3 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %lhs = bitcast i48 %lhs.arg to <2 x i24> %rhs = bitcast i48 %rhs.arg to <2 x i24> %amt = bitcast i48 %amt.arg to <2 x i24> @@ -2755,16 +3028,28 @@ define amdgpu_ps i32 @s_fshl_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_fshl_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_fshl_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_not_b32 s2, s2 +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, v0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: 
s_fshl_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-FAKE16-NEXT: s_not_b32 s1, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result } @@ -2855,15 +3140,25 @@ define i32 @v_fshl_i32(i32 %lhs, i32 %rhs, i32 %amt) { ; GCN-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, v0, v1, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v1, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v0, v1, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result } @@ -2936,14 +3231,23 @@ define 
amdgpu_ps float @v_fshl_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt) ; GFX10-NEXT: v_alignbit_b32 v0, s0, v1, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshl_i32_ssv: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v1, s0, s1, 1 -; GFX11-NEXT: v_not_b32_e32 v0, v0 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v1, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshl_i32_ssv: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s0, s1, 1 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, v1, v0.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshl_i32_ssv: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, s1, 1 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -2985,14 +3289,24 @@ define amdgpu_ps float @v_fshl_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) ; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshl_i32_svs: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshl_i32_svs: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_not_b32 s1, s1 +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, v0, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshl_i32_svs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-FAKE16-NEXT: s_not_b32 s1, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -3037,14 +3351,24 @@ define amdgpu_ps float @v_fshl_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg ; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshl_i32_vss: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshl_i32_vss: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_not_b32 s2, s2 +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, v0, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshl_i32_vss: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 1 +; 
GFX11-FAKE16-NEXT: s_not_b32 s1, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -3103,19 +3427,33 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) { ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_v2i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v2, v0, v2, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: v_alignbit_b32 v3, v1, v3, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_v2i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, v0, v2, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, v1, v3, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_v2i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, v0, v2, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; 
GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, v1, v3, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) ret <2 x i32> %result } @@ -3189,23 +3527,41 @@ define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) { ; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_v3i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, v0, v3, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_not_b32_e32 v6, v6 -; GFX11-NEXT: v_alignbit_b32 v4, v1, v4, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_not_b32_e32 v7, v7 -; GFX11-NEXT: v_alignbit_b32 v5, v2, v5, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_not_b32_e32 v8, v8 -; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_v3i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, v0, v3, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v6 +; GFX11-TRUE16-NEXT: v_alignbit_b32 v4, v1, v4, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v7, v7 +; GFX11-TRUE16-NEXT: v_alignbit_b32 v5, v2, v5, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v8, v8 +; GFX11-TRUE16-NEXT: 
v_alignbit_b32 v0, v0, v3, v6.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v4, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, v2, v5, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_v3i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, v0, v3, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v6 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v4, v1, v4, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v7, v7 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v5, v2, v5, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v8, v8 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v4, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, v2, v5, v8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.fshl.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) ret <3 x i32> %result } @@ -3295,27 +3651,49 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) { ; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_v4i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v4, v0, v4, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_not_b32_e32 v8, v8 -; GFX11-NEXT: v_alignbit_b32 v5, v1, v5, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_not_b32_e32 v9, v9 -; GFX11-NEXT: v_alignbit_b32 v6, v2, v6, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_not_b32_e32 v10, v10 -; GFX11-NEXT: v_alignbit_b32 v7, v3, v7, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX11-NEXT: v_not_b32_e32 v11, v11 -; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8 -; 
GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9 -; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_v4i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v4, v0, v4, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v8, v8 +; GFX11-TRUE16-NEXT: v_alignbit_b32 v5, v1, v5, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v9 +; GFX11-TRUE16-NEXT: v_alignbit_b32 v6, v2, v6, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v10 +; GFX11-TRUE16-NEXT: v_alignbit_b32 v7, v3, v7, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v11 +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v4, v8.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v5, v9.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, v2, v6, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, v3, v7, v11.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_v4i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v4, v0, v4, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v8, v8 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v5, v1, v5, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v9 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v6, v2, v6, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v10, v10 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v7, v3, v7, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v11, v11 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; 
GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v5, v9 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, v2, v6, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, v3, v7, v11 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) ret <4 x i32> %result } @@ -3516,19 +3894,33 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, v2.l, -1 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v2.l, 15 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, 1, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v2, 15, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v3, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) ret i16 %result } @@ -3566,14 +3958,23 @@ define i16 @v_fshl_i16_4(i16 %lhs, i16 %rhs) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshl_i16_4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b16 v0, 4, v0 -; GFX11-NEXT: v_lshrrev_b16 v1, 12, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_i16_4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 4, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 12, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_i16_4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 4, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 12, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 4) ret i16 %result } @@ -3611,14 +4012,23 @@ define i16 @v_fshl_i16_5(i16 %lhs, i16 %rhs) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
v_fshl_i16_5: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b16 v0, 5, v0 -; GFX11-NEXT: v_lshrrev_b16 v1, 11, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshl_i16_5: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 5, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 11, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshl_i16_5: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 5, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 11, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 5) ret i16 %result } @@ -3671,20 +4081,35 @@ define amdgpu_ps half @v_fshl_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt) ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshl_i16_ssv: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0 -; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshl_i16_ssv: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: v_xor_b16 v0.h, v0.l, -1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15 +; GFX11-TRUE16-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v0.h, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshl_i16_ssv: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v0, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v1, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -3731,17 +4156,29 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt) ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshl_i16_svs: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b16 v0, 1, v0 -; GFX11-NEXT: s_and_not1_b32 s2, 15, s1 -; GFX11-NEXT: s_and_b32 s1, s1, 15 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
s_lshl_b32 s0, s0, s1 -; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshl_i16_svs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: s_and_not1_b32 s2, 15, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, s2, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshl_i16_svs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 1, v0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 s2, 15, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, s2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -3791,18 +4228,31 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt) ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshl_i16_vss: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s2, s1, 15 -; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0 -; GFX11-NEXT: s_and_not1_b32 s1, 15, s1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshl_i16_vss: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 15 +; GFX11-TRUE16-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, s2, v0.l +; GFX11-TRUE16-NEXT: s_and_not1_b32 s1, 15, s1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, s0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshl_i16_vss: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 15 +; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, s2, v0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 s1, 15, s1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -4535,52 +4985,94 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_fshl_v3i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s9, s2, 0xffff -; GFX11-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-NEXT: s_and_b32 s6, s4, 0xf000f -; GFX11-NEXT: s_lshr_b32 s9, s9, 0x10001 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s4 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s9, s2 -; GFX11-NEXT: 
s_lshl_b32 s0, s0, s6 -; GFX11-NEXT: s_lshl_b32 s6, s7, s8 -; GFX11-NEXT: s_and_b32 s7, s2, 0xffff -; GFX11-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-NEXT: s_lshr_b32 s8, s4, 16 -; GFX11-NEXT: s_lshr_b32 s4, s7, s4 -; GFX11-NEXT: s_lshr_b32 s2, s2, s8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s2 -; GFX11-NEXT: s_and_b32 s7, s3, 0xffff -; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s2, s5, 0xf000f -; GFX11-NEXT: s_lshr_b32 s7, s7, 0x10001 -; GFX11-NEXT: s_lshr_b32 s3, s3, 1 -; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s5 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: s_lshr_b32 s6, s2, 16 -; GFX11-NEXT: s_lshl_b32 s1, s1, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s3 -; GFX11-NEXT: s_lshl_b32 s3, s5, s6 -; GFX11-NEXT: s_and_b32 s5, s2, 0xffff -; GFX11-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 -; GFX11-NEXT: s_lshr_b32 s4, s5, s4 -; GFX11-NEXT: s_lshr_b32 s2, s2, s6 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s2 -; GFX11-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: s_or_b32 s0, s0, s3 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_fshl_v3i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_and_b32 s9, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s9, 0x10001 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-TRUE16-NEXT: s_and_not1_b32 s4, 0xf000f, s4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s9, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xffff +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s7, s4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s5, 0xf000f +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s8, 0x10001 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_and_not1_b32 s5, 0xf000f, s5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s4, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s8, s3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s3 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_fshl_v3i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_and_b32 s9, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s9, 0x10001 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-FAKE16-NEXT: s_and_not1_b32 s4, 0xf000f, s4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s9, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s7, s4 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s7, 0x10001 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-FAKE16-NEXT: s_and_not1_b32 s4, 0xf000f, s5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) %cast = bitcast <3 x i16> %result to i48 ret i48 %cast @@ -8125,5 +8617,3 @@ declare i128 @llvm.fshl.i128(i128, i128, i128) #0 declare <2 x i128> @llvm.fshl.v2i128(<2 x i128>, <2 x i128>, <2 x i128>) #0 attributes #0 = { nounwind readnone speculatable willreturn } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11-FAKE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index eea0f6065b4d8..cf58ba6658321 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s -; RUN: not llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -o - %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -o - %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { @@ -320,44 +320,83 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_i7: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) -; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: s_mul_i32 s1, s0, -7 -; GFX11-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s0, s0, s1 -; GFX11-NEXT: v_mul_hi_u32 v5, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, -7, v[2:3] -; GFX11-NEXT: v_add_nc_u32_e32 v2, -7, v3 -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-NEXT: v_sub_nc_u16 v3, 6, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v3, 0x7f, v3 -; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_i7: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7f, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-TRUE16-NEXT: s_mul_i32 s1, s0, -7 +; GFX11-TRUE16-NEXT: s_mul_hi_u32 s1, s0, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_mul_hi_u32 v5, v2, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mad_u64_u32 v[3:4], null, v5, -7, v[2:3] +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, -7, v3 +; GFX11-TRUE16-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, -7, v2 +; GFX11-TRUE16-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 6, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7f, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0x7f, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_i7: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-FAKE16-NEXT: s_mul_i32 s1, s0, -7 +; GFX11-FAKE16-NEXT: s_mul_hi_u32 s1, s0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v5, v2, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mad_u64_u32 v[3:4], null, v5, -7, v[2:3] +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, -7, v3 +; GFX11-FAKE16-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, -7, v2 +; GFX11-FAKE16-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0x7f, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v3, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) ret i7 %result } @@ -476,20 +515,35 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX10-NEXT: 
v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 -; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, v2.l, -1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v2.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 7 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v3, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt) ret i8 %result } @@ -578,15 +632,25 @@ define i8 @v_fshr_i8_4(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_i8_4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v0, 4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b16 v1, 4, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_i8_4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 4, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 4, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_i8_4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 4, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 4, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4) ret i8 %result } @@ -675,15 +739,25 @@ define i8 @v_fshr_i8_5(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_i8_5: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v0, 3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b16 v1, 5, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_i8_5: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 3, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 5, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_i8_5: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 5, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5) ret i8 %result } @@ -932,38 +1006,70 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_v2i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 -; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 -; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_and_b32_e32 v2, 7, 
v2 -; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX11-NEXT: v_lshrrev_b16 v3, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b16 v4, v6, v4 -; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b16 v0, v7, v0 -; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_v2i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-TRUE16-NEXT: v_xor_b16 v1.h, v2.l, -1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, v3.l, -1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 1, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, v3.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v1.h, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v2.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, 
v2.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_v2i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v3, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v4, v6, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v7, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %amt = bitcast i16 %amt.arg to <2 x i8> @@ -1443,62 +1549,119 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_v4i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_xor_b32_e32 v12, -1, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 -; GFX11-NEXT: v_xor_b32_e32 v14, -1, v11 -; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX11-NEXT: v_xor_b32_e32 v10, -1, v2 -; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3 -; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 -; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX11-NEXT: 
v_lshlrev_b16 v4, v12, v4 -; GFX11-NEXT: v_lshrrev_b16 v6, v11, v8 -; GFX11-NEXT: v_lshlrev_b16 v5, v7, v5 -; GFX11-NEXT: v_lshrrev_b16 v7, v13, v9 -; GFX11-NEXT: v_lshlrev_b16 v0, v10, v0 -; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v6 -; GFX11-NEXT: v_or_b32_e32 v4, v5, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_v4i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX11-TRUE16-NEXT: v_xor_b16 v1.h, v7.l, -1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 1, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, v7.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v1.h, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX11-TRUE16-NEXT: v_xor_b16 v4.l, v10.l, -1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, v1.h, v2.h +; 
GFX11-TRUE16-NEXT: v_lshrrev_b16 v2.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_xor_b16 v3.h, v11.l, -1 +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, v2.l, -1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 1, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, v4.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, v10.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 1, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, v3.h, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, v11.l, 7 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 7 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, v4.l, v3.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v2.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, v3.h, v5.h +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v3.h, v6.l, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v2.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-FAKE16-LABEL: v_fshr_v4i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v12, -1, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v3, 1, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v14, -1, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v6, v7, v6 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v7, -1, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v3, v12, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 7, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v4, v12, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v6, v11, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v5, v7, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v7, v13, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v10, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v4, 
v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-FAKE16-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> %amt = bitcast i32 %amt.arg to <4 x i8> @@ -2356,133 +2519,242 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_fshr_v2i24: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX11-NEXT: s_lshr_b32 s9, s1, 8 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_lshr_b32 s8, s0, 24 -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_or_b32 s1, s8, s1 -; GFX11-NEXT: s_lshr_b32 s8, s2, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_and_b32 s8, s8, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_and_b32 s7, s9, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 16 -; GFX11-NEXT: s_lshr_b32 s10, s2, 24 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s8, 
8 -; GFX11-NEXT: s_lshr_b32 s11, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s8 -; GFX11-NEXT: s_and_b32 s8, s9, 0xff -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) -; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_and_b32 s9, s11, 0xff -; GFX11-NEXT: s_or_b32 s3, s10, s3 -; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 -; GFX11-NEXT: s_or_b32 s2, s2, s8 -; GFX11-NEXT: s_lshr_b32 s8, s4, 8 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX11-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-NEXT: s_and_b32 s8, s8, 0xff -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_or_b32 s3, s3, s9 -; GFX11-NEXT: s_lshr_b32 s9, s4, 16 -; GFX11-NEXT: s_lshr_b32 s10, s4, 24 -; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s8, 8 -; GFX11-NEXT: s_lshr_b32 s11, s5, 8 -; GFX11-NEXT: s_or_b32 s4, s4, s8 -; GFX11-NEXT: s_and_b32 s8, s9, 0xff -; GFX11-NEXT: v_readfirstlane_b32 s9, v0 -; GFX11-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-NEXT: s_and_b32 s5, s5, 0xff -; GFX11-NEXT: s_or_b32 s4, s4, s8 -; GFX11-NEXT: s_mul_i32 s8, s9, 0xffffffe8 -; GFX11-NEXT: s_lshl_b32 s5, s5, 8 -; GFX11-NEXT: s_mul_hi_u32 s8, s9, s8 -; GFX11-NEXT: s_or_b32 s5, s10, s5 -; GFX11-NEXT: s_add_i32 s9, s9, s8 -; GFX11-NEXT: s_and_b32 s10, s11, 0xff -; GFX11-NEXT: s_mul_hi_u32 s8, s4, s9 -; GFX11-NEXT: s_and_b32 s10, 0xffff, s10 -; GFX11-NEXT: s_mulk_i32 s8, 0xffe8 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX11-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-NEXT: s_add_i32 s4, s4, s8 -; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX11-NEXT: s_or_b32 s5, s5, s10 -; GFX11-NEXT: s_cmp_ge_u32 s4, 24 -; GFX11-NEXT: s_cselect_b32 s8, 1, 
0 -; GFX11-NEXT: s_sub_i32 s10, s4, 24 -; GFX11-NEXT: s_cmp_lg_u32 s8, 0 -; GFX11-NEXT: s_cselect_b32 s4, s10, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_ge_u32 s4, 24 -; GFX11-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-NEXT: s_sub_i32 s10, s4, 24 -; GFX11-NEXT: s_cmp_lg_u32 s8, 0 -; GFX11-NEXT: s_cselect_b32 s4, s10, s4 -; GFX11-NEXT: s_lshl_b32 s6, s6, 17 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_sub_i32 s8, 23, s4 -; GFX11-NEXT: s_or_b32 s0, s6, s0 -; GFX11-NEXT: s_mul_hi_u32 s6, s5, s9 -; GFX11-NEXT: s_lshl_b32 s0, s0, s8 -; GFX11-NEXT: s_mulk_i32 s6, 0xffe8 -; GFX11-NEXT: s_lshr_b32 s2, s2, s4 -; GFX11-NEXT: s_add_i32 s5, s5, s6 -; GFX11-NEXT: s_or_b32 s0, s0, s2 -; GFX11-NEXT: s_cmp_ge_u32 s5, 24 -; GFX11-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-NEXT: s_sub_i32 s4, s5, 24 -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_cselect_b32 s2, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_ge_u32 s2, 24 -; GFX11-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-NEXT: s_sub_i32 s5, s2, 24 -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s2, s5, s2 -; GFX11-NEXT: s_lshl_b32 s4, s7, 17 -; GFX11-NEXT: s_lshl_b32 s1, s1, 1 -; GFX11-NEXT: s_sub_i32 s5, 23, s2 -; GFX11-NEXT: s_or_b32 s1, s4, s1 -; GFX11-NEXT: s_lshr_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s1, s1, s5 -; GFX11-NEXT: s_and_b32 s3, s0, 0xff -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_and_b32 s3, s1, 0xff -; GFX11-NEXT: s_or_b32 s0, s2, s0 -; GFX11-NEXT: s_lshl_b32 s2, s3, 24 -; GFX11-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80008 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s2 -; GFX11-NEXT: s_or_b32 s1, s1, s3 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_fshr_v2i24: +; 
GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 24 +; GFX11-TRUE16-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s8, s1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s2, 24 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s3, s10, s3 +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s9, 0xff +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s11, 0xff +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 24 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s11, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s10 +; GFX11-TRUE16-NEXT: s_mul_i32 s10, s13, 0xffffffe8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX11-TRUE16-NEXT: s_mul_hi_u32 s10, s13, s10 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_mul_hi_u32 s10, s4, s13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_mulk_i32 s10, 0xffe8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s12, s5 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s14, 0xff +; GFX11-TRUE16-NEXT: s_cmp_ge_u32 s4, 24 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, 1, 0 +; GFX11-TRUE16-NEXT: s_sub_i32 s12, s4, 24 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s12, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-TRUE16-NEXT: s_cmp_ge_u32 s4, 24 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-TRUE16-NEXT: s_sub_i32 s8, s4, 24 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s9 +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s8, s4 +; GFX11-TRUE16-NEXT: s_mul_hi_u32 s8, s5, s13 +; GFX11-TRUE16-NEXT: s_sub_i32 s6, 23, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-TRUE16-NEXT: s_mulk_i32 s8, 0xffe8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, s4 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_cmp_ge_u32 s5, 24 +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11-TRUE16-NEXT: s_sub_i32 s4, s5, 24 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s4, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_cmp_ge_u32 s2, 24 +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-TRUE16-NEXT: s_sub_i32 s5, s2, 24 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s5, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 1 +; 
GFX11-TRUE16-NEXT: s_sub_i32 s4, 23, s2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s0, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s0, s2, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s3, 24 +; GFX11-TRUE16-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX11-TRUE16-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s3 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_fshr_v2i24: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 24 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s8, s1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s7, s9, 0xff +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s2, 24 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-FAKE16-NEXT: 
s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s11, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s3, s10, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s4, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s4, 24 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s5, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s8 +; GFX11-FAKE16-NEXT: s_mul_i32 s8, s9, 0xffffffe8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-FAKE16-NEXT: s_mul_hi_u32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s10, s5 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s11, 0xff +; GFX11-FAKE16-NEXT: s_mul_hi_u32 s8, s4, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX11-FAKE16-NEXT: s_mulk_i32 s8, 0xffe8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, 0xffff, s6 +; 
GFX11-FAKE16-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s10 +; GFX11-FAKE16-NEXT: s_cmp_ge_u32 s4, 24 +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-FAKE16-NEXT: s_sub_i32 s10, s4, 24 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s8, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s10, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_cmp_ge_u32 s4, 24 +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-FAKE16-NEXT: s_sub_i32 s10, s4, 24 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s8, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s10, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-FAKE16-NEXT: s_sub_i32 s8, 23, s4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s6, s0 +; GFX11-FAKE16-NEXT: s_mul_hi_u32 s6, s5, s9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s8 +; GFX11-FAKE16-NEXT: s_mulk_i32 s6, 0xffe8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, s4 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_cmp_ge_u32 s5, 24 +; GFX11-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11-FAKE16-NEXT: s_sub_i32 s4, s5, 24 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s4, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_cmp_ge_u32 s2, 24 +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-FAKE16-NEXT: s_sub_i32 s5, s2, 24 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s5, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s7, 17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 1 +; GFX11-FAKE16-NEXT: s_sub_i32 s5, 23, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s4, s1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s0, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s0, 0x80010 
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s3, 24 +; GFX11-FAKE16-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX11-FAKE16-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s3 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %lhs = bitcast i48 %lhs.arg to <2 x i24> %rhs = bitcast i48 %rhs.arg to <2 x i24> %amt = bitcast i48 %amt.arg to <2 x i24> @@ -2774,13 +3046,21 @@ define amdgpu_ps i32 @s_fshr_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_fshr_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_fshr_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_fshr_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result } @@ -2868,11 +3148,17 @@ define i32 @v_fshr_i32(i32 %lhs, 
i32 %rhs, i32 %amt) { ; GCN-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v1, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result } @@ -2933,10 +3219,15 @@ define amdgpu_ps float @v_fshr_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt) ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshr_i32_ssv: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshr_i32_ssv: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshr_i32_ssv: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -2966,10 +3257,17 @@ define amdgpu_ps float @v_fshr_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) ; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshr_i32_svs: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshr_i32_svs: +; 
GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, v0, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshr_i32_svs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -3003,12 +3301,19 @@ define amdgpu_ps float @v_fshr_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshr_i32_vss: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshr_i32_vss: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshr_i32_vss: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -3022,12 +3327,19 @@ define <2 x i32> @v_fshr_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) { ; GCN-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_v2i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 
-; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_v2i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_v2i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) ret <2 x i32> %result } @@ -3041,13 +3353,21 @@ define <3 x i32> @v_fshr_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) { ; GCN-NEXT: v_alignbit_b32 v2, v2, v5, v8 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_v3i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7 -; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_v3i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v3, v6.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v4, v7.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, v2, v5, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_v3i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v4, v7 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, v2, v5, v8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) ret <3 x i32> %result } @@ -3062,14 +3382,23 @@ define <4 x i32> @v_fshr_v4i32(<4 x i32> %lhs, 
<4 x i32> %rhs, <4 x i32> %amt) { ; GCN-NEXT: v_alignbit_b32 v3, v3, v7, v11 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_v4i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9 -; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10 -; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_v4i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v4, v8.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v5, v9.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, v2, v6, v10.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, v3, v7, v11.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_v4i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v5, v9 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, v2, v6, v10 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, v3, v7, v11 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) ret <4 x i32> %result } @@ -3272,19 +3601,33 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0 -; GFX11-NEXT: 
v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, v2.l, -1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v2.l, 15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, v3, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) ret i16 %result } @@ -3322,14 +3665,23 @@ define i16 @v_fshr_i16_4(i16 %lhs, i16 %rhs) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_i16_4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX11-NEXT: v_lshrrev_b16 v1, 4, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
v_fshr_i16_4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 12, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 4, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_i16_4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 4, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 4) ret i16 %result } @@ -3367,14 +3719,23 @@ define i16 @v_fshr_i16_5(i16 %lhs, i16 %rhs) { ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_i16_5: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b16 v0, 11, v0 -; GFX11-NEXT: v_lshrrev_b16 v1, 5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_i16_5: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 11, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, 5, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_i16_5: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 11, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v1, 5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 
@llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 5) ret i16 %result } @@ -3425,18 +3786,31 @@ define amdgpu_ps half @v_fshr_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt) ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshr_i16_ssv: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX11-NEXT: v_lshrrev_b16 v0, v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v1, v1, s0 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshr_i16_ssv: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, v0.l, -1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 15 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v0.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, v0.h, s0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshr_i16_ssv: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, v0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, v1, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v0, v1, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -3484,17 +3858,29 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt) ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshr_i16_svs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s2, s1, 15 -; GFX11-NEXT: s_and_not1_b32 s1, 15, s1 -; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshr_i16_svs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 15 +; GFX11-TRUE16-NEXT: s_and_not1_b32 s1, 15, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, s2, v0.l +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshr_i16_svs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 15 +; GFX11-FAKE16-NEXT: s_and_not1_b32 s1, 15, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, s2, v0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half 
%cast.result @@ -3545,17 +3931,29 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt) ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fshr_i16_vss: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: s_and_not1_b32 s2, 15, s1 -; GFX11-NEXT: s_and_b32 s1, s1, 15 -; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_lshr_b32 s0, s0, s1 -; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0 -; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_fshr_i16_vss: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: s_and_not1_b32 s2, 15, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 15 +; GFX11-TRUE16-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, s2, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, s0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_fshr_i16_vss: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 s2, 15, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 15 +; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, s2, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -4291,51 +4689,91 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; GFX10-NEXT: 
s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_fshr_v3i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s6, s0, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX11-NEXT: s_lshl_b32 s6, s6, 1 -; GFX11-NEXT: s_and_b32 s7, s4, 0xf000f -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s4 -; GFX11-NEXT: s_lshr_b32 s6, s0, 16 -; GFX11-NEXT: s_lshr_b32 s8, s4, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, s4 -; GFX11-NEXT: s_lshl_b32 s4, s6, s8 -; GFX11-NEXT: s_and_b32 s6, s2, 0xffff -; GFX11-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-NEXT: s_lshr_b32 s8, s7, 16 -; GFX11-NEXT: s_lshr_b32 s6, s6, s7 -; GFX11-NEXT: s_lshr_b32 s2, s2, s8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s6, s2 -; GFX11-NEXT: s_and_b32 s4, s5, 0xf000f -; GFX11-NEXT: s_or_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s2, s1, 16 -; GFX11-NEXT: s_lshl_b32 s1, s1, 0x10001 -; GFX11-NEXT: s_lshl_b32 s2, s2, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX11-NEXT: s_and_not1_b32 s2, 0xf000f, s5 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: s_lshr_b32 s6, s2, 16 -; GFX11-NEXT: s_lshl_b32 s1, s1, s2 -; GFX11-NEXT: s_lshl_b32 s2, s5, s6 -; GFX11-NEXT: s_and_b32 s5, s3, 0xffff -; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 -; GFX11-NEXT: s_lshr_b32 s4, s5, s4 -; GFX11-NEXT: s_lshr_b32 s3, s3, s6 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s3 -; GFX11-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: s_or_b32 s0, s0, s3 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_fshr_v3i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX11-TRUE16-NEXT: 
s_lshl_b32 s6, s6, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s4, 0xf000f +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_and_not1_b32 s4, 0xf000f, s4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s6, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s2, 0xffff +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s7, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x10001 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 s4, 0xf000f, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s6, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s5, 0xf000f +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s4, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s3 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_fshr_v3i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s4, 0xf000f +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-FAKE16-NEXT: s_and_not1_b32 s4, 0xf000f, s4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16 
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s6, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s6, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s5, 0xf000f +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x10001 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 s2, 0xf000f, s5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) %cast = bitcast <3 x i16> %result to i48 ret i48 %cast @@ -7897,5 +8335,3 @@ declare i128 @llvm.fshr.i128(i128, i128, i128) #0 declare <2 x i128> @llvm.fshr.v2i128(<2 x i128>, <2 x i128>, <2 x i128>) #0 attributes #0 = { nounwind readnone 
speculatable willreturn } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11-FAKE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector-s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector-s16.mir index b13652d44eeb9..145e553f37204 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector-s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector-s16.mir @@ -1,10 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: not llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=amdgpu-regbanklegalize -mattr=+real-true16 %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s -# GFX11-ERR: LLVM ERROR: AMDGPU RegBankLegalize: none of the rules defined with 'Any' for MI's opcode matched MI: %4:sgpr(s32) = G_MERGE_VALUES %2:sgpr(s16), %3:sgpr(s16) (in function: test_merge_s16_into_s32_sgpr) +# RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=amdgpu-regbanklegalize -mattr=+real-true16 -o - %s | FileCheck -check-prefix=GFX11 %s -# RUN: not llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=amdgpu-regbanklegalize -mattr=+real-true16 %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s -# GFX12-ERR: LLVM ERROR: AMDGPU RegBankLegalize: none of the rules defined with 'Any' for MI's opcode matched MI: %4:sgpr(s32) = G_MERGE_VALUES %2:sgpr(s16), %3:sgpr(s16) (in function: test_merge_s16_into_s32_sgpr) +# RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=amdgpu-regbanklegalize -mattr=+real-true16 -o - %s | FileCheck -check-prefix=GFX12 %s --- name: test_merge_s16_into_s32_sgpr @@ -15,6 +13,25 @@ body: | bb.0: liveins: $sgpr0, $sgpr1 + ; GFX11-LABEL: name: test_merge_s16_into_s32_sgpr + ; GFX11: liveins: $sgpr0, $sgpr1 + 
; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX11-NEXT: [[MV:%[0-9]+]]:sgpr(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: $sgpr0 = COPY [[MV]](s32) + ; + ; GFX12-LABEL: name: test_merge_s16_into_s32_sgpr + ; GFX12: liveins: $sgpr0, $sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX12-NEXT: [[MV:%[0-9]+]]:sgpr(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX12-NEXT: $sgpr0 = COPY [[MV]](s32) %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 %2:sgpr(s16) = G_TRUNC %0:sgpr(s32) @@ -32,6 +49,25 @@ body: | bb.0: liveins: $sgpr0, $sgpr1 + ; GFX11-LABEL: name: test_build_s16_into_s32_sgpr + ; GFX11: liveins: $sgpr0, $sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: $sgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; + ; GFX12-LABEL: name: test_build_s16_into_s32_sgpr + ; GFX12: liveins: $sgpr0, $sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x 
s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX12-NEXT: $sgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 %2:sgpr(s16) = G_TRUNC %0:sgpr(s32) @@ -49,6 +85,25 @@ body: | bb.0: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX11-LABEL: name: test_merge_s16_into_s32_vgpr + ; GFX11: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX11-NEXT: [[MV:%[0-9]+]]:vgpr(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: $vgpr0 = COPY [[MV]](s32) + ; + ; GFX12-LABEL: name: test_merge_s16_into_s32_vgpr + ; GFX12: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX12-NEXT: [[MV:%[0-9]+]]:vgpr(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX12-NEXT: $vgpr0 = COPY [[MV]](s32) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0:vgpr(s32) @@ -66,6 +121,25 @@ body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: test_build_s16_into_s32_vgpr + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; + ; GFX12-LABEL: 
name: test_build_s16_into_s32_vgpr + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0:vgpr(s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll index d9403a96cf6da..2fae211de43fb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: not llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s ; FIXME: regbankcombiner regression, related to: @@ -458,12 +458,26 @@ define i16 @v_orn2_i16(i16 %src0, i16 %src1) { ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX10PLUS-LABEL: v_orn2_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_orn2_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_orn2_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, v1.l, -1 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_orn2_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 ret i16 %or @@ -477,12 +491,26 @@ define amdgpu_ps float @v_orn2_i16_sv(i16 inreg %src0, i16 %src1) { ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: v_orn2_i16_sv: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: v_orn2_i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: v_orn2_i16_sv: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, -1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_orn2_i16_sv: +; 
GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 %zext = zext i16 %or to i32 @@ -498,12 +526,26 @@ define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) { ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: v_orn2_i16_vs: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: v_orn2_i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_xor_b32 s0, s2, -1 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: v_orn2_i16_vs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_xor_b32 s0, s2, -1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, s0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_orn2_i16_vs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_xor_b32 s0, s2, -1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 %zext = zext i16 %or to i32 @@ -690,17 +732,38 @@ define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_orn2_v3i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1 -; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, 
s0, 0xffff -; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_orn2_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b64 s[0:1], -1 +; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_orn2_v3i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-TRUE16-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX11-TRUE16-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_orn2_v3i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-FAKE16-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX11-FAKE16-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %not.src1 = xor <3 x i16> %src1, %or = or <3 x i16> %src0, %not.src1 %cast = bitcast <3 x i16> %or to i48 @@ -743,17 +806,38 @@ define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inre ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_orn2_v3i16_commute: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1 -; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 
16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_orn2_v3i16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b64 s[0:1], -1 +; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_orn2_v3i16_commute: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-TRUE16-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX11-TRUE16-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_orn2_v3i16_commute: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-FAKE16-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX11-FAKE16-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %not.src1 = xor <3 x i16> %src1, %or = or <3 x i16> %not.src1, %src0 %cast = bitcast <3 x i16> %or to i48 @@ -806,22 +890,51 @@ define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 ; GFX9-NEXT: s_and_b32 s3, s5, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_orn2_v3i16_multi_use: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1 -; GFX10PLUS-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] -; GFX10PLUS-NEXT: s_or_b64 
s[0:1], s[2:3], s[4:5] -; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10PLUS-NEXT: s_or_b32 s2, s2, s3 -; GFX10PLUS-NEXT: s_and_b32 s3, s5, 0xffff -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_orn2_v3i16_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b64 s[0:1], -1 +; GFX10-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: s_lshr_b32 s3, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_and_b32 s3, s5, 0xffff +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_orn2_v3i16_multi_use: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-TRUE16-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX11-TRUE16-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s5 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_orn2_v3i16_multi_use: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-FAKE16-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX11-FAKE16-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: 
s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s5, 0xffff +; GFX11-FAKE16-NEXT: ; return to shader part epilog %not.src1 = xor <3 x i16> %src1, %or = or <3 x i16> %src0, %not.src1 %cast.0 = bitcast <3 x i16> %or to i48 @@ -1125,5 +1238,3 @@ define <4 x i16> @v_orn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) { %or = or <4 x i16> %src0, %not.src1 ret <4 x i16> %or } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11-FAKE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-merge-values-build-vector-s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-merge-values-build-vector-s16.mir index ebb3ab0521fde..211f7fffe8900 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-merge-values-build-vector-s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-merge-values-build-vector-s16.mir @@ -1,10 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: not llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=instruction-select -mattr=+real-true16 %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s -# GFX11-ERR: LLVM ERROR: cannot select: %4:sreg_32(s32) = G_MERGE_VALUES %2:sreg_32(s16), %3:sreg_32(s16) (in function: test_merge_s16_into_s32_sgpr) +# RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=instruction-select -mattr=+real-true16 -o - %s | FileCheck -check-prefix=GFX11 %s # -# RUN: not llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=instruction-select -mattr=+real-true16 %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s -# GFX12-ERR: LLVM ERROR: cannot select: %4:sreg_32(s32) = G_MERGE_VALUES %2:sreg_32(s16), 
%3:sreg_32(s16) (in function: test_merge_s16_into_s32_sgpr) +# RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=instruction-select -mattr=+real-true16 -o - %s | FileCheck -check-prefix=GFX12 %s --- name: test_merge_s16_into_s32_sgpr @@ -15,6 +13,21 @@ body: | bb.0: liveins: $sgpr0, $sgpr1 + ; GFX11-LABEL: name: test_merge_s16_into_s32_sgpr + ; GFX11: liveins: $sgpr0, $sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX11-NEXT: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[COPY1]] + ; GFX11-NEXT: $sgpr0 = COPY [[S_PACK_LL_B32_B16_]] + ; + ; GFX12-LABEL: name: test_merge_s16_into_s32_sgpr + ; GFX12: liveins: $sgpr0, $sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX12-NEXT: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[COPY1]] + ; GFX12-NEXT: $sgpr0 = COPY [[S_PACK_LL_B32_B16_]] %0:sreg_32(s32) = COPY $sgpr0 %1:sreg_32(s32) = COPY $sgpr1 %2:sreg_32(s16) = G_TRUNC %0:sreg_32(s32) @@ -32,6 +45,21 @@ body: | bb.0: liveins: $sgpr0, $sgpr1 + ; GFX11-LABEL: name: test_build_s16_into_s32_sgpr + ; GFX11: liveins: $sgpr0, $sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX11-NEXT: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[COPY1]] + ; GFX11-NEXT: $sgpr0 = COPY [[S_PACK_LL_B32_B16_]] + ; + ; GFX12-LABEL: name: test_build_s16_into_s32_sgpr + ; GFX12: liveins: $sgpr0, $sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX12-NEXT: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[COPY1]] + ; GFX12-NEXT: $sgpr0 = COPY [[S_PACK_LL_B32_B16_]] %0:sreg_32(s32) = 
COPY $sgpr0 %1:sreg_32(s32) = COPY $sgpr1 %2:sreg_32(s16) = G_TRUNC %0:sreg_32(s32) @@ -49,6 +77,25 @@ body: | bb.0: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX11-LABEL: name: test_merge_s16_into_s32_vgpr + ; GFX11: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY2]], %subreg.lo16, [[COPY3]], %subreg.hi16 + ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX12-LABEL: name: test_merge_s16_into_s32_vgpr + ; GFX12: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY2]], %subreg.lo16, [[COPY3]], %subreg.hi16 + ; GFX12-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] %0:vgpr_32(s32) = COPY $vgpr0 %1:vgpr_32(s32) = COPY $vgpr1 %2:vgpr_16(s16) = G_TRUNC %0:vgpr_32(s32) @@ -66,6 +113,25 @@ body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: test_build_s16_into_s32_vgpr + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY2]], %subreg.lo16, [[COPY3]], %subreg.hi16 + ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX12-LABEL: name: test_build_s16_into_s32_vgpr + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; 
GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY2]], %subreg.lo16, [[COPY3]], %subreg.hi16 + ; GFX12-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] %0:vgpr_32(s32) = COPY $vgpr0 %1:vgpr_32(s32) = COPY $vgpr1 %2:vgpr_16(s16) = G_TRUNC %0:vgpr_32(s32)