-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[X86][AVX512] Add pseudos for AVX512_*_SETALLONES
#169009
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Introduce AVX512_128_SETALLONES, AVX512_256_SETALLONES pseudos to generate all-ones vectors. Post-RA expansion: Use VEX vpcmpeqd for XMM/YMM0–15 when available (matches current codegen as `AVX512_128/256_SETALLONES` will be preferred over `AVX1/2_SETALLONES` for AVX512VL target). Use EVEX `vpternlogd imm=0xFF` for high regs. Includes MIR tests for both VEX and EVEX paths.
|
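@llvm/pr-subscribers-backend-x86 Author: Abhishek Kaushik (abhishek-kaushik22) Changes: Introduce `AVX512_128_SETALLONES` and `AVX512_256_SETALLONES` pseudos to generate all-ones vectors. Post-RA expansion: use VEX `vpcmpeqd` for XMM/YMM0–15 when available (matches current codegen, as `AVX512_128/256_SETALLONES` will be preferred over `AVX1/2_SETALLONES` for AVX512VL targets); use EVEX `vpternlogd imm=0xFF` for high registers.
Includes MIR tests for both VEX and EVEX paths. Full diff: https://github.com/llvm/llvm-project/pull/169009.diff 4 Files Affected: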
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 1b748b7355716..efb9f1309a528 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -300,6 +300,12 @@ def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllZerosV))]>;
def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllOnesV))]>;
+let AddedComplexity = 1, Predicates = [HasVLX] in {
+ def AVX512_128_SETALLONES : I<0, Pseudo, (outs VR128X:$dst), (ins),
+ "", [(set VR128X:$dst, (v4i32 immAllOnesV))]>;
+ def AVX512_256_SETALLONES : I<0, Pseudo, (outs VR256X:$dst), (ins),
+ "", [(set VR256X:$dst, (v8i32 immAllOnesV))]>;
+}
}
let Predicates = [HasAVX512] in {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5c23f917d0530..3136ad36ca5cb 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -779,6 +779,8 @@ bool X86InstrInfo::isReMaterializableImpl(
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0:
+ case X86::AVX512_128_SETALLONES:
+ case X86::AVX512_256_SETALLONES:
case X86::AVX512_512_SETALLONES:
case X86::AVX512_FsFLD0SD:
case X86::AVX512_FsFLD0SH:
@@ -6253,9 +6255,31 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
return true;
}
+ case X86::AVX512_128_SETALLONES:
+ case X86::AVX512_256_SETALLONES:
case X86::AVX512_512_SETALLONES: {
Register Reg = MIB.getReg(0);
- MIB->setDesc(get(X86::VPTERNLOGDZrri));
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::AVX512_128_SETALLONES: {
+ if (X86::VR128RegClass.contains(Reg))
+ return Expand2AddrUndef(MIB, get(X86::VPCMPEQDrr));
+
+ Opc = X86::VPTERNLOGDZ128rri;
+ break;
+ }
+ case X86::AVX512_256_SETALLONES: {
+ if (X86::VR256RegClass.contains(Reg))
+ return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+
+ Opc = X86::VPTERNLOGDZ256rri;
+ break;
+ }
+ case X86::AVX512_512_SETALLONES:
+ Opc = X86::VPTERNLOGDZrri;
+ break;
+ }
+ MIB->setDesc(get(Opc));
// VPTERNLOGD needs 3 register inputs and an immediate.
// 0xff will return 1s for any input.
MIB.addReg(Reg, RegState::Undef)
@@ -8194,6 +8218,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_256_SET0:
+ case X86::AVX512_256_SETALLONES:
Alignment = Align(32);
break;
case X86::V_SET0:
@@ -8201,6 +8226,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_128_SET0:
case X86::FsFLD0F128:
case X86::AVX512_FsFLD0F128:
+ case X86::AVX512_128_SETALLONES:
Alignment = Align(16);
break;
case X86::MMX_SET0:
@@ -8259,6 +8285,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0:
+ case X86::AVX512_128_SETALLONES:
+ case X86::AVX512_256_SETALLONES:
case X86::AVX512_512_SETALLONES:
case X86::FsFLD0SH:
case X86::AVX512_FsFLD0SH:
@@ -8319,6 +8347,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
break;
case X86::AVX1_SETALLONES:
case X86::AVX2_SETALLONES:
+ case X86::AVX512_256_SETALLONES:
IsAllOnes = true;
[[fallthrough]];
case X86::AVX512_256_SET0:
@@ -8332,6 +8361,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
2);
break;
case X86::V_SETALLONES:
+ case X86::AVX512_128_SETALLONES:
IsAllOnes = true;
[[fallthrough]];
case X86::V_SET0:
diff --git a/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir
new file mode 100644
index 0000000000000..7e5ddc4cd632f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir
@@ -0,0 +1,30 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+# RUN: llc %s -mtriple=x86_64-- -start-before=postrapseudos -o - | FileCheck %s
+
+--- |
+ target triple = "x86_64-unknown-unknown"
+
+ define void @setallones() #0 {
+ ; CHECK-LABEL: setallones:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vpcmpeqd %xmm14, %xmm14, %xmm14
+ ; CHECK-NEXT: vpternlogd {{.*#+}} xmm16 = -1
+ ; CHECK-NEXT: vpcmpeqd %ymm15, %ymm15, %ymm15
+ ; CHECK-NEXT: vpternlogd {{.*#+}} ymm17 = -1
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "target-features"="+avx512f,+avx512vl" }
+---
+name: setallones
+tracksRegLiveness: true
+liveins: []
+body: |
+ bb.0:
+ $xmm14 = AVX512_128_SETALLONES
+ $xmm16 = AVX512_128_SETALLONES
+ $ymm15 = AVX512_256_SETALLONES
+ $ymm17 = AVX512_256_SETALLONES
+
+...
diff --git a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
index 3243d950740ca..e2400fbe2c4ff 100644
--- a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
+++ b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
@@ -106,7 +106,8 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) {
; AVX512: # %bb.0:
; AVX512-NEXT: subq $24, %rsp
; AVX512-NEXT: .cfi_def_cfa_offset 32
-; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: callq use.v4.i32@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
|
🐧 Linux x64 Test Results
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero]
Why is AddedComplexity = 1 needed here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Without the AddedComplexity, AVX2_SETALLONES is selected during isel instead of this one.
|
Something we're struggling with is duplicated all-ones/zeros expansions for different widths; I'm worried this patch could make that even worse if we don't address it. |
The motivation behind this patch is this crash during regalloc https://godbolt.org/z/ern6PxqKj |
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Introduce `AVX512_128_SETALLONES`, `AVX512_256_SETALLONES` pseudos to generate all-ones vectors. Post-RA expansion: - Use VEX vpcmpeqd for XMM/YMM0–15 when available (matches current codegen as `AVX512_128/256_SETALLONES` will be preferred over `AVX1/2_SETALLONES` for AVX512VL target). - Use EVEX `vpternlogd imm=0xFF` for high regs. Includes MIR tests for both VEX and EVEX paths.
Introduce `AVX512_128_SETALLONES`, `AVX512_256_SETALLONES` pseudos to generate all-ones vectors. Post-RA expansion: - Use VEX vpcmpeqd for XMM/YMM0–15 when available (matches current codegen as `AVX512_128/256_SETALLONES` will be preferred over `AVX1/2_SETALLONES` for AVX512VL target). - Use EVEX `vpternlogd imm=0xFF` for high regs. Includes MIR tests for both VEX and EVEX paths.
Introduce
`AVX512_128_SETALLONES`, `AVX512_256_SETALLONES` pseudos to generate all-ones vectors. Post-RA expansion:
- Use VEX `vpcmpeqd` for XMM/YMM0–15 when available (matches current codegen as `AVX512_128/256_SETALLONES` will be preferred over `AVX1/2_SETALLONES` for AVX512VL target). - Use EVEX `vpternlogd imm=0xFF` for high regs. Includes MIR tests for both VEX and EVEX paths.