[X86] combineVectorSizedSetCCEquality - allow 256/512-bit vector icmp_ne/eq zero comparisons #163373

RKSimon · 2025-10-14T11:23:10Z

We avoid creating vector movmsk/ptest comparisons with zero if we can just use scalar OR instead, but this doesn't make sense for 256-bit or larger vectors, where the scalar lowering creates a more complex OR chain.

This more closely matches what we do for icmp_ne/eq against non-zero values.

I'm hoping that we can eventually allow even larger vectors to be handled with OR/AND chains - but for now this just allows us to handle legal 256/512-bit vector widths.

[X86] combineVectorSizedSetCCEquality - allow 256/512-bit vector icmp_ne/eq zero comparisons. We avoid creating vector movmsk/ptest comparisons with zero if we can just use scalar OR instead, but this doesn't make sense for 256-bit or larger vectors, where the scalar lowering creates a more complex OR chain. This more closely matches what we do for icmp_ne/eq against non-zero values. I'm hoping that we can eventually allow even larger vectors to be handled with OR/AND chains - but for now this just allows us to handle legal 256/512-bit vector widths.

llvmbot · 2025-10-14T12:16:27Z

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

We avoid creating vector movmsk/ptest comparisons with zero if we can just use scalar OR instead, but this doesn't make sense for 256-bit or larger vectors, where the scalar lowering creates a more complex OR chain.

This more closely matches what we do for icmp_ne/eq against non-zero values.

I'm hoping that we can eventually allow even larger vectors to be handled with OR/AND chains - but for now this just allows us to handle legal 256/512-bit vector widths.

Full diff: https://github.com/llvm/llvm-project/pull/163373.diff

2 Files Affected:

(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+1-1)
(modified) llvm/test/CodeGen/X86/setcc-wide-types.ll (+41-24)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index eea84a2841764..edaf20a1aa436 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22856,7 +22856,7 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
   // be generated by the memcmp expansion pass with oversized integer compares
   // (see PR33325).
   bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
-  if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
+  if (isNullConstant(Y) && OpSize == 128 && !IsOrXorXorTreeCCZero)
     return SDValue();
 
   // Don't perform this combine if constructing the vector will be expensive.
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 69abf6e0bec35..d018c535ea8f7 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -1493,15 +1493,23 @@ define i1 @allbits_i128_load_arg(ptr %w) {
 }
 
 define i1 @anybits_i256_load_arg(ptr %w) {
-; ANY-LABEL: anybits_i256_load_arg:
-; ANY:       # %bb.0:
-; ANY-NEXT:    movq (%rdi), %rax
-; ANY-NEXT:    movq 8(%rdi), %rcx
-; ANY-NEXT:    orq 24(%rdi), %rcx
-; ANY-NEXT:    orq 16(%rdi), %rax
-; ANY-NEXT:    orq %rcx, %rax
-; ANY-NEXT:    setne %al
-; ANY-NEXT:    retq
+; SSE-LABEL: anybits_i256_load_arg:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq (%rdi), %rax
+; SSE-NEXT:    movq 8(%rdi), %rcx
+; SSE-NEXT:    orq 24(%rdi), %rcx
+; SSE-NEXT:    orq 16(%rdi), %rax
+; SSE-NEXT:    orq %rcx, %rax
+; SSE-NEXT:    setne %al
+; SSE-NEXT:    retq
+;
+; AVXANY-LABEL: anybits_i256_load_arg:
+; AVXANY:       # %bb.0:
+; AVXANY-NEXT:    vmovdqu (%rdi), %ymm0
+; AVXANY-NEXT:    vptest %ymm0, %ymm0
+; AVXANY-NEXT:    setne %al
+; AVXANY-NEXT:    vzeroupper
+; AVXANY-NEXT:    retq
   %ld = load i256, ptr %w
   %cmp = icmp ne i256 %ld, 0
   ret i1 %cmp
@@ -1552,21 +1560,30 @@ define i1 @allbits_i256_load_arg(ptr %w) {
 }
 
 define i1 @anybits_i512_load_arg(ptr %w) {
-; ANY-LABEL: anybits_i512_load_arg:
-; ANY:       # %bb.0:
-; ANY-NEXT:    movq 16(%rdi), %rax
-; ANY-NEXT:    movq (%rdi), %rcx
-; ANY-NEXT:    movq 8(%rdi), %rdx
-; ANY-NEXT:    movq 24(%rdi), %rsi
-; ANY-NEXT:    orq 56(%rdi), %rsi
-; ANY-NEXT:    orq 40(%rdi), %rdx
-; ANY-NEXT:    orq %rsi, %rdx
-; ANY-NEXT:    orq 48(%rdi), %rax
-; ANY-NEXT:    orq 32(%rdi), %rcx
-; ANY-NEXT:    orq %rax, %rcx
-; ANY-NEXT:    orq %rdx, %rcx
-; ANY-NEXT:    setne %al
-; ANY-NEXT:    retq
+; NO512-LABEL: anybits_i512_load_arg:
+; NO512:       # %bb.0:
+; NO512-NEXT:    movq 16(%rdi), %rax
+; NO512-NEXT:    movq (%rdi), %rcx
+; NO512-NEXT:    movq 8(%rdi), %rdx
+; NO512-NEXT:    movq 24(%rdi), %rsi
+; NO512-NEXT:    orq 56(%rdi), %rsi
+; NO512-NEXT:    orq 40(%rdi), %rdx
+; NO512-NEXT:    orq %rsi, %rdx
+; NO512-NEXT:    orq 48(%rdi), %rax
+; NO512-NEXT:    orq 32(%rdi), %rcx
+; NO512-NEXT:    orq %rax, %rcx
+; NO512-NEXT:    orq %rdx, %rcx
+; NO512-NEXT:    setne %al
+; NO512-NEXT:    retq
+;
+; AVX512-LABEL: anybits_i512_load_arg:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT:    kortestw %k0, %k0
+; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %ld = load i512, ptr %w
   %cmp = icmp ne i512 %ld, 0
   ret i1 %cmp

phoebewang

LGTM.

[X86] combineVectorSizedSetCCEquality - allow 256/512-bit vector icmp_ne/eq zero comparisons (llvm#163373). We avoid creating vector movmsk/ptest comparisons with zero if we can just use scalar OR instead, but this doesn't make sense for 256-bit or larger vectors, where the scalar lowering creates a more complex OR chain. This more closely matches what we do for icmp_ne/eq against non-zero values. I'm hoping that we can eventually allow even larger vectors to be handled with OR/AND chains - but for now this just allows us to handle legal 256/512-bit vector widths.

RKSimon requested a review from phoebewang October 14, 2025 11:23

llvmbot added the backend:X86 label Oct 14, 2025

phoebewang approved these changes Oct 14, 2025

View reviewed changes

Merge branch 'main' into x86-widevec-is-zero

4b82ad5

RKSimon enabled auto-merge (squash) October 14, 2025 14:49

RKSimon merged commit f435930 into llvm:main Oct 14, 2025
9 of 10 checks passed

RKSimon deleted the x86-widevec-is-zero branch October 15, 2025 18:03

RKSimon mentioned this pull request Oct 22, 2025

[X86] Poor codegen for NE/EQ zero/allones comparisons for very large integers #164632

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[X86] combineVectorSizedSetCCEquality - allow 256/512-bit vector icmp_ne/eq zero comparisons #163373

[X86] combineVectorSizedSetCCEquality - allow 256/512-bit vector icmp_ne/eq zero comparisons #163373

Uh oh!

RKSimon commented Oct 14, 2025

Uh oh!

llvmbot commented Oct 14, 2025

Uh oh!

phoebewang left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[X86] combineVectorSizedSetCCEquality - allow 256/512-bit vector icmp_ne/eq zero comparisons #163373

[X86] combineVectorSizedSetCCEquality - allow 256/512-bit vector icmp_ne/eq zero comparisons #163373

Uh oh!

Conversation

RKSimon commented Oct 14, 2025

Uh oh!

llvmbot commented Oct 14, 2025

Uh oh!

phoebewang left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants