-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[X86][AVX512] rematerialize smaller predicate masks #166178
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86 Author: Ahmed Nour (ahmednoursphinx) ChangesResolves #165752 Full diff: https://github.com/llvm/llvm-project/pull/166178.diff 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 1b748b7355716..9fae602974242 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3161,6 +3161,12 @@ multiclass avx512_mask_setop_w<SDPatternOperator Val> {
defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
+// 8-bit mask set operations for AVX512DQ
+let Predicates = [HasDQI] in {
+ defm KSET0B : avx512_mask_setop<VK8, v8i1, immAllZerosV>;
+ defm KSET1B : avx512_mask_setop<VK8, v8i1, immAllOnesV>;
+}
+
// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
let Predicates = [HasAVX512] in {
def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
@@ -3173,6 +3179,25 @@ let Predicates = [HasAVX512] in {
def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>;
}
+// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper bits
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1 immAllZerosV), (KSET0B)>;
+ def : Pat<(v8i1 immAllOnesV), (KSET1B)>;
+}
+
+// Optimize bitconvert of all-ones constants to use kxnor instructions
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1 (bitconvert (i8 255))), (KSET1B)>;
+ def : Pat<(v16i1 (bitconvert (i16 255))), (COPY_TO_REGCLASS (KSET1B), VK16)>;
+}
+let Predicates = [HasAVX512] in {
+ def : Pat<(v16i1 (bitconvert (i16 65535))), (KSET1W)>;
+}
+let Predicates = [HasBWI] in {
+ def : Pat<(v32i1 (bitconvert (i32 -1))), (KSET1D)>;
+ def : Pat<(v64i1 (bitconvert (i64 -1))), (KSET1Q)>;
+}
+
// Patterns for kmask insert_subvector/extract_subvector to/from index=0
multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT,
RegisterClass RC, ValueType VT> {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 6b2a7a4ec3583..3eadac4f827bc 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -789,9 +789,11 @@ bool X86InstrInfo::isReMaterializableImpl(
case X86::FsFLD0SS:
case X86::FsFLD0SH:
case X86::FsFLD0F128:
+ case X86::KSET0B:
case X86::KSET0D:
case X86::KSET0Q:
case X86::KSET0W:
+ case X86::KSET1B:
case X86::KSET1D:
case X86::KSET1Q:
case X86::KSET1W:
@@ -6352,12 +6354,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// registers, since it is not usable as a write mask.
// FIXME: A more advanced approach would be to choose the best input mask
// register based on context.
+ case X86::KSET0B:
+ return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0);
case X86::KSET0W:
return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
case X86::KSET0D:
return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
case X86::KSET0Q:
return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
+ case X86::KSET1B:
+ return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0);
case X86::KSET1W:
return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
case X86::KSET1D:
diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
new file mode 100644
index 0000000000000..6a1a0af05d05c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQBW
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+
+; Test case 1: v16i1 with all bits set (should use kxnorw on all targets)
+define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512F-LABEL: gather_all:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: kxnorw %k0, %k0, %k1
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: gather_all:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1
+; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: gather_all:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: kxnorw %k0, %k0, %k1
+; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: gather_all:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k1
+; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQBW-NEXT: retq
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float>undef)
+ ret <16 x float> %res
+}
+
+; Test case 2: v8i1 with lower 8 bits set (should use kxnorb on AVX512DQ targets)
+define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512F-LABEL: gather_lower:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: gather_lower:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: gather_lower:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: movw $255, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: gather_lower:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQBW-NEXT: retq
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float>undef)
+ ret <16 x float> %res
+}
+
+
|
|
✅ With the latest revision this PR passed the undef deprecator. |
This reverts commit 25a8351.
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
still missing v32i1 / v64i1 sub mask test coverage
Sorry missed that , added in latest commit |
|
Hey @RKSimon @phoebewang PR is ready for review again when you have time |
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These need regenerating with update_llc_test_checks.py:
Failed Tests (5):
LLVM :: CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
LLVM :: CodeGen/X86/avx512-gather-scatter-intrin.ll
LLVM :: CodeGen/X86/masked_gather_scatter.ll
LLVM :: CodeGen/X86/scatter-schedule.ll
LLVM :: CodeGen/X86/vector-replicaton-i1-mask.ll
|
Hey @RKSimon tests have been updated please check when you have time |
|
Hey @RKSimon please check again when you have time |
|
Hey @RKSimon this PR ready for review again when you have time |
|
Hey @RKSimon can you review this again please when you have some time |
|
Hey @RKSimon addressed your feedback please check when you have time |
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - @phoebewang ?
phoebewang
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
|
Hey @RKSimon , @phoebewang can you please merge this PR |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/116/builds/20834 Here is the relevant piece of the build log for the reference |
Resolves #165752