Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[X86] ptest is commutable as long as only the Z flag is used. #88969

Merged
merged 4 commits into from
Apr 18, 2024

Conversation

topperc
Copy link
Collaborator

@topperc topperc commented Apr 16, 2024

Fixes #88958.

@llvmbot
Copy link
Collaborator

llvmbot commented Apr 16, 2024

@llvm/pr-subscribers-backend-x86

Author: Craig Topper (topperc)

Changes

Fixes #88958.


Full diff: https://github.com/llvm/llvm-project/pull/88969.diff

2 Files Affected:

  • (modified) llvm/lib/Target/X86/X86InstrSSE.td (+18)
  • (modified) llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll (+19-13)
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 5d799fc00df92b..385d65d0bcaaf6 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5688,6 +5688,13 @@ let Predicates = [UseSSE41, OptForSize] in {
 // SSE4.1 - Packed Bit Test
 //===----------------------------------------------------------------------===//
 
+// ptest is commutable if only the Z flag is used. If the C flag is used,
+// commuting would change which operand is inverted.
+def X86ptest_commutable : PatFrag<(ops node:$src1, node:$src2),
+                                  (X86ptest node:$src1, node:$src2), [{
+  return onlyUsesZeroFlag(SDValue(Node, 0));
+}]>;
+
 // ptest instruction we'll lower to this in X86ISelLowering primarily from
 // the intel intrinsic that corresponds to this.
 let Defs = [EFLAGS], Predicates = [HasAVX] in {
@@ -5723,6 +5730,17 @@ def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
               Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
 }
 
+let Predicates = [HasAVX] in {
+  def : Pat<(X86ptest_commutable (loadv2i64 addr:$src2), VR128:$src1),
+            (VPTESTrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86ptest_commutable (loadv4i64 addr:$src2), VR256:$src1),
+            (VPTESTYrm VR256:$src1, addr:$src2)>;
+}
+let Predicates = [UseSSE41] in {
+  def : Pat<(X86ptest_commutable (memopv2i64 addr:$src2), VR128:$src1),
+            (PTESTrm VR128:$src1, addr:$src2)>;
+}
+
 // The bit test instructions below are AVX only
 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index f8ba00b0332994..9cd0f4d12e15ab 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -1018,32 +1018,38 @@ define zeroext i1 @PR44781(ptr %0) {
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: PR44781:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX1OR2-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; AVX1OR2-NEXT:    sete %al
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: PR44781:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm0 = [15,15,15,15]
+; AVX1-NEXT:    vptest (%rdi), %xmm0
+; AVX1-NEXT:    sete %al
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: PR44781:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
+; AVX2-NEXT:    vptest (%rdi), %xmm0
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: PR44781:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512F-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
+; AVX512F-NEXT:    vptest (%rdi), %xmm0
 ; AVX512F-NEXT:    sete %al
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: PR44781:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512BW-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
+; AVX512BW-NEXT:    vptest (%rdi), %xmm0
 ; AVX512BW-NEXT:    sete %al
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: PR44781:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [64424509455,64424509455]
-; AVX512BWVL-NEXT:    vptest %xmm1, %xmm0
+; AVX512BWVL-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [64424509455,64424509455]
+; AVX512BWVL-NEXT:    vptest (%rdi), %xmm0
 ; AVX512BWVL-NEXT:    sete %al
 ; AVX512BWVL-NEXT:    retq
   %2 = load <4 x i32>, ptr %0, align 4

@topperc
Copy link
Collaborator Author

topperc commented Apr 16, 2024

I guess this still needs SSE 4.1 coverage. @phoebewang do you want to pick this up?

@phoebewang
Copy link
Contributor

I guess this still needs SSE 4.1 coverage. @phoebewang do you want to pick this up?

Done by 17b86d5, please rebase.

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No objections from me - technically we're still missing the ability to fold some loads that appear later (from stack spills etc.) but short of making a full pseudo instruction I'm not sure whether we can do it cleanly.

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also - what about TESTPS/TESTPD?

@@ -5737,6 +5755,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}

// ptest is commutable if only the Z flag is used. If the C flag is used,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

testp

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd spell it testps/testpd or testpX

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM - cheers

@topperc topperc merged commit 254cdcd into llvm:main Apr 18, 2024
3 of 4 checks passed
@topperc topperc deleted the pr/commute-test branch April 18, 2024 16:50
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
6 participants