Patch summary (descriptive text before the first "diff --git" header; it is
not part of any hunk):

  * X86InstrAVX512.td: adds the AVX512_128_SETALLONES and
    AVX512_256_SETALLONES pseudos, predicated on HasVLX.
  * X86InstrInfo.cpp: treats the new pseudos like AVX512_512_SETALLONES for
    rematerialization and memory-operand folding, and expands them after
    register allocation to VPCMPEQD when the destination is in VR128/VR256,
    otherwise to VPTERNLOGD of the matching width.  The inner opcode switch
    has a default llvm_unreachable so Opc can never be read uninitialized.
  * Tests: two new MIR tests for the post-RA expansion (i386 and x86_64) and
    an updated CHECK sequence in eq-or-eq-range-of-2.ll.

Note: the X86InstrInfo.cpp "index" blob hashes are carried over from the
change this patch is derived from; the hunk line counts/offsets below have
been adjusted for the four extra added lines.

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 70564973816b1..e8fda829e2394 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -300,6 +300,12 @@ def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
                [(set VR512:$dst, (v16i32 immAllZerosV))]>;
 def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
                [(set VR512:$dst, (v16i32 immAllOnesV))]>;
+let AddedComplexity = 1, Predicates = [HasVLX] in {
+  def AVX512_128_SETALLONES : I<0, Pseudo, (outs VR128X:$dst), (ins),
+                                "", [(set VR128X:$dst, (v4i32 immAllOnesV))]>;
+  def AVX512_256_SETALLONES : I<0, Pseudo, (outs VR256X:$dst), (ins),
+                                "", [(set VR256X:$dst, (v8i32 immAllOnesV))]>;
+}
 }
 
 let Predicates = [HasAVX512] in {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index cb0208a4a5f32..b988ae0aca912 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -778,6 +778,8 @@ bool X86InstrInfo::isReMaterializableImpl(
   case X86::AVX512_128_SET0:
   case X86::AVX512_256_SET0:
   case X86::AVX512_512_SET0:
+  case X86::AVX512_128_SETALLONES:
+  case X86::AVX512_256_SETALLONES:
   case X86::AVX512_512_SETALLONES:
   case X86::AVX512_FsFLD0SD:
   case X86::AVX512_FsFLD0SH:
@@ -6246,9 +6248,35 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
     return true;
   }
+  case X86::AVX512_128_SETALLONES:
+  case X86::AVX512_256_SETALLONES:
   case X86::AVX512_512_SETALLONES: {
     Register Reg = MIB.getReg(0);
-    MIB->setDesc(get(X86::VPTERNLOGDZrri));
+    unsigned Opc;
+    // 128/256-bit pseudos whose register is in VR128/VR256 expand to VPCMPEQD;
+    // every other case uses the VPTERNLOGD form of the matching width.
+    switch (MI.getOpcode()) {
+    case X86::AVX512_128_SETALLONES: {
+      if (X86::VR128RegClass.contains(Reg))
+        return Expand2AddrUndef(MIB, get(X86::VPCMPEQDrr));
+
+      Opc = X86::VPTERNLOGDZ128rri;
+      break;
+    }
+    case X86::AVX512_256_SETALLONES: {
+      if (X86::VR256RegClass.contains(Reg))
+        return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+
+      Opc = X86::VPTERNLOGDZ256rri;
+      break;
+    }
+    case X86::AVX512_512_SETALLONES:
+      Opc = X86::VPTERNLOGDZrri;
+      break;
+    default:
+      llvm_unreachable("Unexpected SETALLONES pseudo opcode");
+    }
+    MIB->setDesc(get(Opc));
     // VPTERNLOGD needs 3 register inputs and an immediate.
     // 0xff will return 1s for any input.
     MIB.addReg(Reg, RegState::Undef)
@@ -8190,6 +8218,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     case X86::AVX1_SETALLONES:
     case X86::AVX_SET0:
     case X86::AVX512_256_SET0:
+    case X86::AVX512_256_SETALLONES:
       Alignment = Align(32);
       break;
     case X86::V_SET0:
@@ -8197,6 +8226,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     case X86::AVX512_128_SET0:
     case X86::FsFLD0F128:
     case X86::AVX512_FsFLD0F128:
+    case X86::AVX512_128_SETALLONES:
       Alignment = Align(16);
       break;
     case X86::MMX_SET0:
@@ -8255,6 +8285,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   case X86::AVX512_128_SET0:
   case X86::AVX512_256_SET0:
   case X86::AVX512_512_SET0:
+  case X86::AVX512_128_SETALLONES:
+  case X86::AVX512_256_SETALLONES:
   case X86::AVX512_512_SETALLONES:
   case X86::FsFLD0SH:
   case X86::AVX512_FsFLD0SH:
@@ -8315,6 +8347,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
       break;
     case X86::AVX1_SETALLONES:
     case X86::AVX2_SETALLONES:
+    case X86::AVX512_256_SETALLONES:
       IsAllOnes = true;
       [[fallthrough]];
     case X86::AVX512_256_SET0:
@@ -8328,6 +8361,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
                                 2);
       break;
     case X86::V_SETALLONES:
+    case X86::AVX512_128_SETALLONES:
       IsAllOnes = true;
       [[fallthrough]];
     case X86::V_SET0:
diff --git a/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir
new file mode 100644
index 0000000000000..0d8f2177aaa30
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir
@@ -0,0 +1,26 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+# RUN: llc %s -mtriple=i386-- -start-before=postrapseudos -o - | FileCheck %s
+
+--- |
+  target triple = "i386-unknown-unknown"
+
+  define void @setallones() #0 {
+    ; CHECK-LABEL: setallones:
+    ; CHECK:       # %bb.0:
+    ; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+    ; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+  entry:
+    unreachable
+  }
+
+  attributes #0 = { "target-features"="+avx512f,+avx512vl" }
+---
+name: setallones
+tracksRegLiveness: true
+liveins: []
+body: |
+  bb.0:
+    $xmm0 = AVX512_128_SETALLONES
+    $ymm1 = AVX512_256_SETALLONES
+
+...
diff --git a/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir
new file mode 100644
index 0000000000000..7e5ddc4cd632f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir
@@ -0,0 +1,30 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+# RUN: llc %s -mtriple=x86_64-- -start-before=postrapseudos -o - | FileCheck %s
+
+--- |
+  target triple = "x86_64-unknown-unknown"
+
+  define void @setallones() #0 {
+    ; CHECK-LABEL: setallones:
+    ; CHECK:       # %bb.0:
+    ; CHECK-NEXT:    vpcmpeqd %xmm14, %xmm14, %xmm14
+    ; CHECK-NEXT:    vpternlogd {{.*#+}} xmm16 = -1
+    ; CHECK-NEXT:    vpcmpeqd %ymm15, %ymm15, %ymm15
+    ; CHECK-NEXT:    vpternlogd {{.*#+}} ymm17 = -1
+  entry:
+    unreachable
+  }
+
+  attributes #0 = { "target-features"="+avx512f,+avx512vl" }
+---
+name: setallones
+tracksRegLiveness: true
+liveins: []
+body: |
+  bb.0:
+    $xmm14 = AVX512_128_SETALLONES
+    $xmm16 = AVX512_128_SETALLONES
+    $ymm15 = AVX512_256_SETALLONES
+    $ymm17 = AVX512_256_SETALLONES
+
+...
diff --git a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
index 3243d950740ca..e2400fbe2c4ff 100644
--- a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
+++ b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
@@ -106,7 +106,8 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    subq $24, %rsp
 ; AVX512-NEXT:    .cfi_def_cfa_offset 32
-; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    callq use.v4.i32@PLT
 ; AVX512-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload