diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ead00a9d2015a..b4299d541079d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42625,6 +42625,34 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) return R; + // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y). + // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X). + // iff the upper half elements of the non-shifted arg are known zero. + // KUNPCK requires 16+ bool vector elements. + if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfElts = NumElts / 2; + APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts); + if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL && + N1.getConstantOperandAPInt(1) == HalfElts && + DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) { + SDLoc dl(N); + return DAG.getNode( + ISD::CONCAT_VECTORS, dl, VT, + extractSubVector(N0, 0, DAG, dl, HalfElts), + extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts)); + } + if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL && + N0.getConstantOperandAPInt(1) == HalfElts && + DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) { + SDLoc dl(N); + return DAG.getNode( + ISD::CONCAT_VECTORS, dl, VT, + extractSubVector(N1, 0, DAG, dl, HalfElts), + extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts)); + } + } + // Attempt to recursively combine an OR of shuffles. 
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 618505865e6c7..2cc84c105125e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -892,10 +892,7 @@ define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0 ; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftlw $8, %k1, %k1 -; AVX512F-NEXT: kshiftrw $8, %k1, %k1 -; AVX512F-NEXT: korw %k1, %k0, %k1 +; AVX512F-NEXT: kunpckbw %k1, %k0, %k1 ; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper @@ -905,8 +902,7 @@ define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0 ; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1 -; AVX512VL-NEXT: kshiftlw $8, %k0, %k0 -; AVX512VL-NEXT: korw %k1, %k0, %k1 +; AVX512VL-NEXT: kunpckbw %k1, %k0, %k1 ; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper @@ -916,8 +912,7 @@ define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> ; VL_BW_DQ: # %bb.0: # %entry ; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0 ; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1 -; VL_BW_DQ-NEXT: kshiftlw $8, %k0, %k0 -; VL_BW_DQ-NEXT: korw %k1, %k0, %k1 +; VL_BW_DQ-NEXT: kunpckbw %k1, %k0, %k1 ; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1} ; VL_BW_DQ-NEXT: vzeroupper @@ -945,10 +940,7 @@ define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x f ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0 ; AVX512F-NEXT: 
vcmpltps %zmm3, %zmm2, %k1 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftlw $8, %k1, %k1 -; AVX512F-NEXT: kshiftrw $8, %k1, %k1 -; AVX512F-NEXT: korw %k0, %k1, %k1 +; AVX512F-NEXT: kunpckbw %k1, %k0, %k1 ; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper @@ -958,8 +950,7 @@ define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x f ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0 ; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1 -; AVX512VL-NEXT: kshiftlw $8, %k0, %k0 -; AVX512VL-NEXT: korw %k0, %k1, %k1 +; AVX512VL-NEXT: kunpckbw %k1, %k0, %k1 ; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper @@ -969,8 +960,7 @@ define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x f ; VL_BW_DQ: # %bb.0: # %entry ; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0 ; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1 -; VL_BW_DQ-NEXT: kshiftlw $8, %k0, %k0 -; VL_BW_DQ-NEXT: korw %k0, %k1, %k1 +; VL_BW_DQ-NEXT: kunpckbw %k1, %k0, %k1 ; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1} ; VL_BW_DQ-NEXT: vzeroupper