Skip to content

Commit

Permalink
[X86] combinePredicateReduction - split vXi16 allof(cmpeq()) to vXi8 …
Browse files Browse the repository at this point in the history
…allof(cmpeq())

vXi16 allof(cmp()) reduction patterns have to pack the comparison results to vXi8 in order to use PMOVMSKB.

If we're reducing cmpeq(), then we can compare the vXi8 halves directly - similar to what we already do for vXi64 -> vXi32 for cases without PCMPEQQ.
  • Loading branch information
RKSimon committed Jan 24, 2022
1 parent 4cfea31 commit 11bb4a1
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 68 deletions.
15 changes: 9 additions & 6 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -42215,18 +42215,21 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
// For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
// PCMPEQQ (SSE41+), use PCMPEQD instead.
if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
// For all_of(setcc(vec,0,eq))
// - avoid vXi64 comparisons without PCMPEQQ (SSE41+), use PCMPEQD.
// - avoid vXi16 comparisons, use PMOVMSKB(PCMPEQB()).
if (BinOp == ISD::AND &&
Match.getOpcode() == ISD::SETCC &&
ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
ISD::CondCode::SETEQ) {
SDValue Vec = Match.getOperand(0);
if (Vec.getValueType().getScalarType() == MVT::i64 &&
(2 * NumElts) <= MaxElts) {
EVT VecSVT = Vec.getValueType().getScalarType();
if ((VecSVT == MVT::i16 && !Subtarget.hasBWI()) ||
(VecSVT == MVT::i64 && !Subtarget.hasSSE41())) {
NumElts *= 2;
EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
VecSVT = VecSVT.getHalfSizedIntegerVT(*DAG.getContext());
EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumElts);
MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
Match = DAG.getSetCC(
DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
Expand Down
121 changes: 59 additions & 62 deletions llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
Expand Up @@ -938,23 +938,24 @@ define i1 @icmp0_v4i32_v4i1(<4 x i32>) {
}

define i1 @icmp0_v8i16_v8i1(<8 x i16>) {
; SSE-LABEL: icmp0_v8i16_v8i1:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pcmpeqw %xmm0, %xmm1
; SSE-NEXT: packsswb %xmm1, %xmm1
; SSE-NEXT: pmovmskb %xmm1, %eax
; SSE-NEXT: cmpb $-1, %al
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; SSE2-LABEL: icmp0_v8i16_v8i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; SSE41-LABEL: icmp0_v8i16_v8i1:
; SSE41: # %bb.0:
; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; AVX-LABEL: icmp0_v8i16_v8i1:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: cmpb $-1, %al
; AVX-NEXT: vptest %xmm0, %xmm0
; AVX-NEXT: sete %al
; AVX-NEXT: retq
;
Expand Down Expand Up @@ -1184,26 +1185,28 @@ define i1 @icmp0_v8i32_v8i1(<8 x i32>) {
}

define i1 @icmp0_v16i16_v16i1(<16 x i16>) {
; SSE-LABEL: icmp0_v16i16_v16i1:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pcmpeqw %xmm2, %xmm1
; SSE-NEXT: pcmpeqw %xmm2, %xmm0
; SSE-NEXT: packsswb %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: cmpw $-1, %ax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; SSE2-LABEL: icmp0_v16i16_v16i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; SSE41-LABEL: icmp0_v16i16_v16i1:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; AVX1-LABEL: icmp0_v16i16_v16i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: cmpw $-1, %ax
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vptest %xmm0, %xmm0
; AVX1-NEXT: sete %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
Expand Down Expand Up @@ -1477,47 +1480,41 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) {
}

define i1 @icmp0_v32i16_v32i1(<32 x i16>) {
; SSE-LABEL: icmp0_v32i16_v32i1:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: pcmpeqw %xmm4, %xmm1
; SSE-NEXT: pcmpeqw %xmm4, %xmm0
; SSE-NEXT: packsswb %xmm1, %xmm0
; SSE-NEXT: pcmpeqw %xmm4, %xmm3
; SSE-NEXT: pcmpeqw %xmm4, %xmm2
; SSE-NEXT: packsswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: pmovmskb %xmm2, %eax
; SSE-NEXT: cmpw $-1, %ax
; SSE-NEXT: sete %al
; SSE-NEXT: retq
; SSE2-LABEL: icmp0_v32i16_v32i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pcmpeqb %xmm4, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %eax
; SSE2-NEXT: cmpw $-1, %ax
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; SSE41-LABEL: icmp0_v32i16_v32i1:
; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: ptest %xmm1, %xmm1
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
; AVX1-LABEL: icmp0_v32i16_v32i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: cmpw $-1, %ax
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vptest %xmm0, %xmm0
; AVX1-NEXT: sete %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: icmp0_v32i16_v32i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: cmpl $-1, %eax
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vptest %ymm0, %ymm0
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
Expand Down

0 comments on commit 11bb4a1

Please sign in to comment.