diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index bedec0c8974a8..3a51c7c2ca854 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29830,6 +29830,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, if (VT.isVector()) { APInt APIntShiftAmt; bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt); + unsigned NumElts = VT.getVectorNumElements(); if (Subtarget.hasVBMI2() && EltSizeInBits > 8) { if (IsFSHR) @@ -29858,6 +29859,29 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt; uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt); + assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift"); + + if (EltSizeInBits == 8 && ShXAmt > 1 && + (Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT))) { + // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG + // bit-select - lower using vXi16 shifts and then perform the bitmask at + // the original vector width to handle cases where we split. + MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2); + APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt); + APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt); + SDValue ShX = + DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0), + DAG.getShiftAmountConstant(ShXAmt, WideVT, DL)); + SDValue ShY = + DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1), + DAG.getShiftAmountConstant(ShYAmt, WideVT, DL)); + ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX), + DAG.getConstant(MaskX, DL, VT)); + ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY), + DAG.getConstant(MaskY, DL, VT)); + return DAG.getNode(ISD::OR, DL, VT, ShX, ShY); + } + SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0, DAG.getShiftAmountConstant(ShXAmt, VT, DL)); SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1, @@ -29874,7 +29898,6 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL; - unsigned NumElts = VT.getVectorNumElements(); MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits); MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2); diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 1addedf3c3d96..0459d47eed817 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -2453,9 +2453,9 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; ; XOP-LABEL: splatconstant_funnnel_v16i8: ; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpsrlw $4, %xmm1, %xmm1 +; XOP-NEXT: vpsllw $4, %xmm0, %xmm0 +; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X86-SSE2-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index ebcb1cb15a600..e81b9adfdd3e3 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -2344,17 +2344,15 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 638a3cdaa2c1d..b839452725a95 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -2462,9 +2462,9 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; ; XOP-LABEL: splatconstant_funnnel_v16i8: ; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpsrlw $4, %xmm1, %xmm1 +; XOP-NEXT: vpsllw $4, %xmm0, %xmm0 +; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X86-SSE2-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 3fabf720da71c..7b6b0ea83c7ee 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -2145,17 +2145,15 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8: