-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] ptest is commutable as long as only the Z flag is used. #88969
Conversation
@llvm/pr-subscribers-backend-x86 Author: Craig Topper (topperc) Changes: Fixes #88958. Full diff: https://github.com/llvm/llvm-project/pull/88969.diff 2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 5d799fc00df92b..385d65d0bcaaf6 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5688,6 +5688,13 @@ let Predicates = [UseSSE41, OptForSize] in {
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//
+// ptest is commutable if only the Z flag is used. If the C flag is used,
+// commuting would change which operand is inverted.
+def X86ptest_commutable : PatFrag<(ops node:$src1, node:$src2),
+ (X86ptest node:$src1, node:$src2), [{
+ return onlyUsesZeroFlag(SDValue(Node, 0));
+}]>;
+
// ptest instruction we'll lower to this in X86ISelLowering primarily from
// the intel intrinsic that corresponds to this.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
@@ -5723,6 +5730,17 @@ def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}
+let Predicates = [HasAVX] in {
+ def : Pat<(X86ptest_commutable (loadv2i64 addr:$src2), VR128:$src1),
+ (VPTESTrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86ptest_commutable (loadv4i64 addr:$src2), VR256:$src1),
+ (VPTESTYrm VR256:$src1, addr:$src2)>;
+}
+let Predicates = [UseSSE41] in {
+ def : Pat<(X86ptest_commutable (memopv2i64 addr:$src2), VR128:$src1),
+ (PTESTrm VR128:$src1, addr:$src2)>;
+}
+
// The bit test instructions below are AVX only
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index f8ba00b0332994..9cd0f4d12e15ab 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -1018,32 +1018,38 @@ define zeroext i1 @PR44781(ptr %0) {
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: PR44781:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; AVX1OR2-NEXT: sete %al
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: PR44781:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [15,15,15,15]
+; AVX1-NEXT: vptest (%rdi), %xmm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR44781:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
+; AVX2-NEXT: vptest (%rdi), %xmm0
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: PR44781:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
+; AVX512F-NEXT: vptest (%rdi), %xmm0
; AVX512F-NEXT: sete %al
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: PR44781:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
+; AVX512BW-NEXT: vptest (%rdi), %xmm0
; AVX512BW-NEXT: sete %al
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR44781:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64424509455,64424509455]
-; AVX512BWVL-NEXT: vptest %xmm1, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [64424509455,64424509455]
+; AVX512BWVL-NEXT: vptest (%rdi), %xmm0
; AVX512BWVL-NEXT: sete %al
; AVX512BWVL-NEXT: retq
%2 = load <4 x i32>, ptr %0, align 4
|
I guess this still needs SSE 4.1 coverage. @phoebewang do you want to pick this up?
Done by 17b86d5, please rebase.
145b1bc
to
3eeb6ca
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No objections from me - technically we're still missing the ability to fold some loads that appear later (from stack spills etc.) but short of making a full pseudo instruction I'm not sure whether we can do it cleanly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also - what about TESTPS/TESTPD?
llvm/lib/Target/X86/X86InstrSSE.td
Outdated
@@ -5737,6 +5755,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, | |||
Sched<[sched.Folded, sched.ReadAfterFold]>, VEX; | |||
} | |||
|
|||
// ptest is commutable if only the Z flag is used. If the C flag is used, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
testp
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd spell it testps/testpd
or testpX
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
Fixes #88958.