Skip to content

Commit

Permalink
[x86] allow pairs of PCMPEQ for vector-sized integer equality comparisons (PR33325)
Browse files Browse the repository at this point in the history

This is an extension of D31156, with the goal of allowing the memcmp() == 0 expansion
for x86 to use 2 pairs of loads per block.

The memcmp expansion pass (formerly part of CGP) will generate this kind of pattern 
with oversized integer compares, so we want to transform these into x86-specific vector
nodes before legalization splits things into scalar chunks.

See PR33325 for more details:
https://bugs.llvm.org/show_bug.cgi?id=33325

Differential Revision: https://reviews.llvm.org/D41618

llvm-svn: 321656
  • Loading branch information
rotateright committed Jan 2, 2018
1 parent 854d10d commit 9a80871
Show file tree
Hide file tree
Showing 2 changed files with 227 additions and 95 deletions.
38 changes: 31 additions & 7 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -36316,13 +36316,23 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

// We're looking for an oversized integer equality comparison, but ignore a
// comparison with zero because that gets special treatment in EmitTest().
// We're looking for an oversized integer equality comparison.
SDValue X = SetCC->getOperand(0);
SDValue Y = SetCC->getOperand(1);
EVT OpVT = X.getValueType();
unsigned OpSize = OpVT.getSizeInBits();
if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
if (!OpVT.isScalarInteger() || OpSize < 128)
return SDValue();

// Ignore a comparison with zero because that gets special treatment in
// EmitTest(). But make an exception for the special case of a pair of
// logically-combined vector-sized operands compared to zero. This pattern may
// be generated by the memcmp expansion pass with oversized integer compares
// (see PR33325).
bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
X.getOperand(0).getOpcode() == ISD::XOR &&
X.getOperand(1).getOpcode() == ISD::XOR;
if (isNullConstant(Y) && !IsOrXorXorCCZero)
return SDValue();

// Bail out if we know that this is not really just an oversized integer.
Expand All @@ -36337,15 +36347,29 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && Subtarget.hasAVX2())) {
EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
SDValue VecX = DAG.getBitcast(VecVT, X);
SDValue VecY = DAG.getBitcast(VecVT, Y);

SDValue Cmp;
if (IsOrXorXorCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
// Use 2 vector equality compares and 'and' the results before doing a
// MOVMSK.
SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B);
SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D);
Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
} else {
SDValue VecX = DAG.getBitcast(VecVT, X);
SDValue VecY = DAG.getBitcast(VecVT, Y);
Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
}
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
// setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
// setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
MVT::i32);
Expand Down
284 changes: 196 additions & 88 deletions llvm/test/CodeGen/X86/setcc-wide-types.ll
Expand Up @@ -193,22 +193,33 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; if we allowed 2 pairs of 16-byte loads per block.

define i32 @ne_i128_pair(i128* %a, i128* %b) {
; ANY-LABEL: ne_i128_pair:
; ANY: # %bb.0:
; ANY-NEXT: movq (%rdi), %rax
; ANY-NEXT: movq 8(%rdi), %rcx
; ANY-NEXT: xorq (%rsi), %rax
; ANY-NEXT: xorq 8(%rsi), %rcx
; ANY-NEXT: movq 24(%rdi), %rdx
; ANY-NEXT: movq 16(%rdi), %rdi
; ANY-NEXT: xorq 16(%rsi), %rdi
; ANY-NEXT: orq %rax, %rdi
; ANY-NEXT: xorq 24(%rsi), %rdx
; ANY-NEXT: orq %rcx, %rdx
; ANY-NEXT: xorl %eax, %eax
; ANY-NEXT: orq %rdi, %rdx
; ANY-NEXT: setne %al
; ANY-NEXT: retq
; SSE2-LABEL: ne_i128_pair:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu 16(%rdi), %xmm1
; SSE2-NEXT: movdqu (%rsi), %xmm2
; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; SSE2-NEXT: movdqu 16(%rsi), %xmm0
; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %ecx
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
; AVXANY-LABEL: ne_i128_pair:
; AVXANY: # %bb.0:
; AVXANY-NEXT: vmovdqu (%rdi), %xmm0
; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1
; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1
; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
; AVXANY-NEXT: xorl %eax, %eax
; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; AVXANY-NEXT: setne %al
; AVXANY-NEXT: retq
%a0 = load i128, i128* %a
%b0 = load i128, i128* %b
%xor1 = xor i128 %a0, %b0
Expand All @@ -227,22 +238,33 @@ define i32 @ne_i128_pair(i128* %a, i128* %b) {
; if we allowed 2 pairs of 16-byte loads per block.

define i32 @eq_i128_pair(i128* %a, i128* %b) {
; ANY-LABEL: eq_i128_pair:
; ANY: # %bb.0:
; ANY-NEXT: movq (%rdi), %rax
; ANY-NEXT: movq 8(%rdi), %rcx
; ANY-NEXT: xorq (%rsi), %rax
; ANY-NEXT: xorq 8(%rsi), %rcx
; ANY-NEXT: movq 24(%rdi), %rdx
; ANY-NEXT: movq 16(%rdi), %rdi
; ANY-NEXT: xorq 16(%rsi), %rdi
; ANY-NEXT: orq %rax, %rdi
; ANY-NEXT: xorq 24(%rsi), %rdx
; ANY-NEXT: orq %rcx, %rdx
; ANY-NEXT: xorl %eax, %eax
; ANY-NEXT: orq %rdi, %rdx
; ANY-NEXT: sete %al
; ANY-NEXT: retq
; SSE2-LABEL: eq_i128_pair:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu 16(%rdi), %xmm1
; SSE2-NEXT: movdqu (%rsi), %xmm2
; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; SSE2-NEXT: movdqu 16(%rsi), %xmm0
; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %ecx
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; AVXANY-LABEL: eq_i128_pair:
; AVXANY: # %bb.0:
; AVXANY-NEXT: vmovdqu (%rdi), %xmm0
; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1
; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1
; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
; AVXANY-NEXT: xorl %eax, %eax
; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; AVXANY-NEXT: sete %al
; AVXANY-NEXT: retq
%a0 = load i128, i128* %a
%b0 = load i128, i128* %b
%xor1 = xor i128 %a0, %b0
Expand All @@ -261,34 +283,77 @@ define i32 @eq_i128_pair(i128* %a, i128* %b) {
; if we allowed 2 pairs of 32-byte loads per block.

define i32 @ne_i256_pair(i256* %a, i256* %b) {
; ANY-LABEL: ne_i256_pair:
; ANY: # %bb.0:
; ANY-NEXT: movq 16(%rdi), %r9
; ANY-NEXT: movq 24(%rdi), %r11
; ANY-NEXT: movq (%rdi), %r8
; ANY-NEXT: movq 8(%rdi), %r10
; ANY-NEXT: xorq 8(%rsi), %r10
; ANY-NEXT: xorq 24(%rsi), %r11
; ANY-NEXT: xorq (%rsi), %r8
; ANY-NEXT: xorq 16(%rsi), %r9
; ANY-NEXT: movq 48(%rdi), %rdx
; ANY-NEXT: movq 32(%rdi), %rax
; ANY-NEXT: movq 56(%rdi), %rcx
; ANY-NEXT: movq 40(%rdi), %rdi
; ANY-NEXT: xorq 40(%rsi), %rdi
; ANY-NEXT: xorq 56(%rsi), %rcx
; ANY-NEXT: orq %r11, %rcx
; ANY-NEXT: orq %rdi, %rcx
; ANY-NEXT: orq %r10, %rcx
; ANY-NEXT: xorq 32(%rsi), %rax
; ANY-NEXT: xorq 48(%rsi), %rdx
; ANY-NEXT: orq %r9, %rdx
; ANY-NEXT: orq %rax, %rdx
; ANY-NEXT: orq %r8, %rdx
; ANY-NEXT: xorl %eax, %eax
; ANY-NEXT: orq %rcx, %rdx
; ANY-NEXT: setne %al
; ANY-NEXT: retq
; SSE2-LABEL: ne_i256_pair:
; SSE2: # %bb.0:
; SSE2-NEXT: movq 16(%rdi), %r9
; SSE2-NEXT: movq 24(%rdi), %r11
; SSE2-NEXT: movq (%rdi), %r8
; SSE2-NEXT: movq 8(%rdi), %r10
; SSE2-NEXT: xorq 8(%rsi), %r10
; SSE2-NEXT: xorq 24(%rsi), %r11
; SSE2-NEXT: xorq (%rsi), %r8
; SSE2-NEXT: xorq 16(%rsi), %r9
; SSE2-NEXT: movq 48(%rdi), %rdx
; SSE2-NEXT: movq 32(%rdi), %rax
; SSE2-NEXT: movq 56(%rdi), %rcx
; SSE2-NEXT: movq 40(%rdi), %rdi
; SSE2-NEXT: xorq 40(%rsi), %rdi
; SSE2-NEXT: xorq 56(%rsi), %rcx
; SSE2-NEXT: orq %r11, %rcx
; SSE2-NEXT: orq %rdi, %rcx
; SSE2-NEXT: orq %r10, %rcx
; SSE2-NEXT: xorq 32(%rsi), %rax
; SSE2-NEXT: xorq 48(%rsi), %rdx
; SSE2-NEXT: orq %r9, %rdx
; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: orq %r8, %rdx
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
; AVX1-LABEL: ne_i256_pair:
; AVX1: # %bb.0:
; AVX1-NEXT: movq 16(%rdi), %r9
; AVX1-NEXT: movq 24(%rdi), %r11
; AVX1-NEXT: movq (%rdi), %r8
; AVX1-NEXT: movq 8(%rdi), %r10
; AVX1-NEXT: xorq 8(%rsi), %r10
; AVX1-NEXT: xorq 24(%rsi), %r11
; AVX1-NEXT: xorq (%rsi), %r8
; AVX1-NEXT: xorq 16(%rsi), %r9
; AVX1-NEXT: movq 48(%rdi), %rdx
; AVX1-NEXT: movq 32(%rdi), %rax
; AVX1-NEXT: movq 56(%rdi), %rcx
; AVX1-NEXT: movq 40(%rdi), %rdi
; AVX1-NEXT: xorq 40(%rsi), %rdi
; AVX1-NEXT: xorq 56(%rsi), %rcx
; AVX1-NEXT: orq %r11, %rcx
; AVX1-NEXT: orq %rdi, %rcx
; AVX1-NEXT: orq %r10, %rcx
; AVX1-NEXT: xorq 32(%rsi), %rax
; AVX1-NEXT: xorq 48(%rsi), %rdx
; AVX1-NEXT: orq %r9, %rdx
; AVX1-NEXT: orq %rax, %rdx
; AVX1-NEXT: orq %r8, %rdx
; AVX1-NEXT: xorl %eax, %eax
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: setne %al
; AVX1-NEXT: retq
;
; AVX256-LABEL: ne_i256_pair:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqu (%rdi), %ymm0
; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpmovmskb %ymm0, %ecx
; AVX256-NEXT: xorl %eax, %eax
; AVX256-NEXT: cmpl $-1, %ecx
; AVX256-NEXT: setne %al
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
%a0 = load i256, i256* %a
%b0 = load i256, i256* %b
%xor1 = xor i256 %a0, %b0
Expand All @@ -307,34 +372,77 @@ define i32 @ne_i256_pair(i256* %a, i256* %b) {
; if we allowed 2 pairs of 32-byte loads per block.

define i32 @eq_i256_pair(i256* %a, i256* %b) {
; ANY-LABEL: eq_i256_pair:
; ANY: # %bb.0:
; ANY-NEXT: movq 16(%rdi), %r9
; ANY-NEXT: movq 24(%rdi), %r11
; ANY-NEXT: movq (%rdi), %r8
; ANY-NEXT: movq 8(%rdi), %r10
; ANY-NEXT: xorq 8(%rsi), %r10
; ANY-NEXT: xorq 24(%rsi), %r11
; ANY-NEXT: xorq (%rsi), %r8
; ANY-NEXT: xorq 16(%rsi), %r9
; ANY-NEXT: movq 48(%rdi), %rdx
; ANY-NEXT: movq 32(%rdi), %rax
; ANY-NEXT: movq 56(%rdi), %rcx
; ANY-NEXT: movq 40(%rdi), %rdi
; ANY-NEXT: xorq 40(%rsi), %rdi
; ANY-NEXT: xorq 56(%rsi), %rcx
; ANY-NEXT: orq %r11, %rcx
; ANY-NEXT: orq %rdi, %rcx
; ANY-NEXT: orq %r10, %rcx
; ANY-NEXT: xorq 32(%rsi), %rax
; ANY-NEXT: xorq 48(%rsi), %rdx
; ANY-NEXT: orq %r9, %rdx
; ANY-NEXT: orq %rax, %rdx
; ANY-NEXT: orq %r8, %rdx
; ANY-NEXT: xorl %eax, %eax
; ANY-NEXT: orq %rcx, %rdx
; ANY-NEXT: sete %al
; ANY-NEXT: retq
; SSE2-LABEL: eq_i256_pair:
; SSE2: # %bb.0:
; SSE2-NEXT: movq 16(%rdi), %r9
; SSE2-NEXT: movq 24(%rdi), %r11
; SSE2-NEXT: movq (%rdi), %r8
; SSE2-NEXT: movq 8(%rdi), %r10
; SSE2-NEXT: xorq 8(%rsi), %r10
; SSE2-NEXT: xorq 24(%rsi), %r11
; SSE2-NEXT: xorq (%rsi), %r8
; SSE2-NEXT: xorq 16(%rsi), %r9
; SSE2-NEXT: movq 48(%rdi), %rdx
; SSE2-NEXT: movq 32(%rdi), %rax
; SSE2-NEXT: movq 56(%rdi), %rcx
; SSE2-NEXT: movq 40(%rdi), %rdi
; SSE2-NEXT: xorq 40(%rsi), %rdi
; SSE2-NEXT: xorq 56(%rsi), %rcx
; SSE2-NEXT: orq %r11, %rcx
; SSE2-NEXT: orq %rdi, %rcx
; SSE2-NEXT: orq %r10, %rcx
; SSE2-NEXT: xorq 32(%rsi), %rax
; SSE2-NEXT: xorq 48(%rsi), %rdx
; SSE2-NEXT: orq %r9, %rdx
; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: orq %r8, %rdx
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; AVX1-LABEL: eq_i256_pair:
; AVX1: # %bb.0:
; AVX1-NEXT: movq 16(%rdi), %r9
; AVX1-NEXT: movq 24(%rdi), %r11
; AVX1-NEXT: movq (%rdi), %r8
; AVX1-NEXT: movq 8(%rdi), %r10
; AVX1-NEXT: xorq 8(%rsi), %r10
; AVX1-NEXT: xorq 24(%rsi), %r11
; AVX1-NEXT: xorq (%rsi), %r8
; AVX1-NEXT: xorq 16(%rsi), %r9
; AVX1-NEXT: movq 48(%rdi), %rdx
; AVX1-NEXT: movq 32(%rdi), %rax
; AVX1-NEXT: movq 56(%rdi), %rcx
; AVX1-NEXT: movq 40(%rdi), %rdi
; AVX1-NEXT: xorq 40(%rsi), %rdi
; AVX1-NEXT: xorq 56(%rsi), %rcx
; AVX1-NEXT: orq %r11, %rcx
; AVX1-NEXT: orq %rdi, %rcx
; AVX1-NEXT: orq %r10, %rcx
; AVX1-NEXT: xorq 32(%rsi), %rax
; AVX1-NEXT: xorq 48(%rsi), %rdx
; AVX1-NEXT: orq %r9, %rdx
; AVX1-NEXT: orq %rax, %rdx
; AVX1-NEXT: orq %r8, %rdx
; AVX1-NEXT: xorl %eax, %eax
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: sete %al
; AVX1-NEXT: retq
;
; AVX256-LABEL: eq_i256_pair:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqu (%rdi), %ymm0
; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpmovmskb %ymm0, %ecx
; AVX256-NEXT: xorl %eax, %eax
; AVX256-NEXT: cmpl $-1, %ecx
; AVX256-NEXT: sete %al
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
%a0 = load i256, i256* %a
%b0 = load i256, i256* %b
%xor1 = xor i256 %a0, %b0
Expand Down

0 comments on commit 9a80871

Please sign in to comment.