release/21.x: [X86] Use pseudo instructions to zero registers in buildClearRegister (#163358)
#164076
Conversation
@RKSimon What do you think about merging this PR to the release branch?
@llvm/pr-subscribers-backend-x86

Author: None (llvmbot)

Changes

Backport 228dae7

Requested by: @RKSimon

Full diff: https://github.com/llvm/llvm-project/pull/164076.diff

2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index abf365eedec39..9bf58dd3458cd 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10739,39 +10739,27 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
     if (!ST.hasSSE1())
       return;
-    // PXOR is safe to use because it doesn't affect flags.
-    BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
-        .addReg(Reg, RegState::Undef)
-        .addReg(Reg, RegState::Undef);
+    BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
   } else if (X86::VR256RegClass.contains(Reg)) {
     // YMM#
     if (!ST.hasAVX())
       return;
-    // VPXOR is safe to use because it doesn't affect flags.
-    BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
-        .addReg(Reg, RegState::Undef)
-        .addReg(Reg, RegState::Undef);
+    BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
   } else if (X86::VR512RegClass.contains(Reg)) {
     // ZMM#
     if (!ST.hasAVX512())
       return;
-    // VPXORY is safe to use because it doesn't affect flags.
-    BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
-        .addReg(Reg, RegState::Undef)
-        .addReg(Reg, RegState::Undef);
+    BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
   } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
              X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
              X86::VK16RegClass.contains(Reg)) {
     if (!ST.hasVLX())
       return;
-    // KXOR is safe to use because it doesn't affect flags.
-    unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
-    BuildMI(MBB, Iter, DL, get(Op), Reg)
-        .addReg(Reg, RegState::Undef)
-        .addReg(Reg, RegState::Undef);
+    unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
+    BuildMI(MBB, Iter, DL, get(Op), Reg);
   }
 }
diff --git a/llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll b/llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll
new file mode 100644
index 0000000000000..d9253e0ca127b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl,+avx512bw -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512BW
+
+define void @zero_xmm(<4 x i32> %arg) #0 {
+; SSE-LABEL: zero_xmm:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: zero_xmm:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm0, 0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: zero_xmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %xmm0, 0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ store <4 x i32> %arg, ptr null, align 32
+ ret void
+}
+
+define void @zero_ymm(<8 x i32> %arg) #0 {
+; SSE-LABEL: zero_ymm:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm1, 16
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: zero_ymm:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %ymm0, 0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: zero_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %ymm0, 0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ store <8 x i32> %arg, ptr null, align 32
+ ret void
+}
+
+define void @zero_zmm(<16 x i32> %arg) #0 {
+; SSE-LABEL: zero_zmm:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm3, 48
+; SSE-NEXT: movaps %xmm2, 32
+; SSE-NEXT: movaps %xmm1, 16
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: zero_zmm:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %ymm1, 32
+; AVX-NEXT: vmovaps %ymm0, 0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: zero_zmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups %zmm0, 0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ store <16 x i32> %arg, ptr null, align 32
+ ret void
+}
+
+define void @zero_k(<8 x i32> %arg, <8 x i1> %mask) #0 {
+; SSE-LABEL: zero_k:
+; SSE: # %bb.0:
+; SSE-NEXT: psllw $15, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm2
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: testb $1, %al
+; SSE-NEXT: jne .LBB3_1
+; SSE-NEXT: # %bb.2: # %else
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: jne .LBB3_3
+; SSE-NEXT: .LBB3_4: # %else2
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: jne .LBB3_5
+; SSE-NEXT: .LBB3_6: # %else4
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: jne .LBB3_7
+; SSE-NEXT: .LBB3_8: # %else6
+; SSE-NEXT: testb $16, %al
+; SSE-NEXT: jne .LBB3_9
+; SSE-NEXT: .LBB3_10: # %else8
+; SSE-NEXT: testb $32, %al
+; SSE-NEXT: jne .LBB3_11
+; SSE-NEXT: .LBB3_12: # %else10
+; SSE-NEXT: testb $64, %al
+; SSE-NEXT: jne .LBB3_13
+; SSE-NEXT: .LBB3_14: # %else12
+; SSE-NEXT: testb $-128, %al
+; SSE-NEXT: je .LBB3_16
+; SSE-NEXT: .LBB3_15: # %cond.store13
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; SSE-NEXT: movd %xmm0, 28
+; SSE-NEXT: .LBB3_16: # %else14
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: retq
+; SSE-NEXT: .LBB3_1: # %cond.store
+; SSE-NEXT: movd %xmm0, 0
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: je .LBB3_4
+; SSE-NEXT: .LBB3_3: # %cond.store1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE-NEXT: movd %xmm2, 4
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: je .LBB3_6
+; SSE-NEXT: .LBB3_5: # %cond.store3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE-NEXT: movd %xmm2, 8
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: je .LBB3_8
+; SSE-NEXT: .LBB3_7: # %cond.store5
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE-NEXT: movd %xmm0, 12
+; SSE-NEXT: testb $16, %al
+; SSE-NEXT: je .LBB3_10
+; SSE-NEXT: .LBB3_9: # %cond.store7
+; SSE-NEXT: movd %xmm1, 16
+; SSE-NEXT: testb $32, %al
+; SSE-NEXT: je .LBB3_12
+; SSE-NEXT: .LBB3_11: # %cond.store9
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movd %xmm0, 20
+; SSE-NEXT: testb $64, %al
+; SSE-NEXT: je .LBB3_14
+; SSE-NEXT: .LBB3_13: # %cond.store11
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: movd %xmm0, 24
+; SSE-NEXT: testb $-128, %al
+; SSE-NEXT: jne .LBB3_15
+; SSE-NEXT: jmp .LBB3_16
+;
+; AVX1-LABEL: zero_k:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovps %ymm0, %ymm1, 0
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: zero_k:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, 0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: zero_k:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512VL-NEXT: vmovdqa32 %ymm0, 0 {%k1}
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: kxorw %k0, %k0, %k1
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: zero_k:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovw2m %xmm1, %k1
+; AVX512BW-NEXT: vmovdqa32 %ymm0, 0 {%k1}
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: kxorq %k0, %k0, %k1
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ tail call void @llvm.masked.store.v8i32.p0(<8 x i32> %arg, ptr null, i32 32, <8 x i1> %mask)
+ ret void
+}
+
+attributes #0 = { "zero-call-used-regs"="used" }
RKSimon left a comment:
LGTM - @phoebewang ?
phoebewang left a comment:
LGTM, too.
[X86] Use pseudo instructions to zero registers in `buildClearRegister` (llvm#163358)

In `buildClearRegister`, use the correct pseudo-opcode for each register class:
- For `VR128`, use `V_SET0`
- For `VR256`, use `AVX_SET0`
- For `VR512`, use `AVX512_512_SET0`
- For `VK*`, use `KSET0Q`/`KSET0W`

This avoids illegal register/opcode pairings and machine verifier errors when clearing call-used registers under `-fzero-call-used-regs=used`.

Fixes: llvm#163053

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>

(cherry picked from commit 228dae7)
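For reference, the code path being fixed can be exercised with a tiny IR reproducer modeled on the zero_xmm case in the new test. The sketch below is illustrative rather than part of the patch, and the function name @clear_used_xmm is made up; compiled with llc, -mattr=+sse2 and -verify-machineinstrs, the return path has to clear %xmm0, which is where buildClearRegister now emits the V_SET0 pseudo (shown as a plain xorps in the SSE check lines of the test).

; Illustrative reproducer (not part of this patch), adapted from the new test.
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 -verify-machineinstrs
define void @clear_used_xmm(<4 x i32> %arg) #0 {
  store <4 x i32> %arg, ptr null, align 32
  ret void
}

attributes #0 = { "zero-call-used-regs"="used" }

With the attribute set to "used", every call-used register the function actually touched is zeroed before the return, so %xmm0 is expected to be cleared right after the store.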
@RKSimon (or anyone else): if you would like to add a note about this fix to the release notes (completely optional), please reply to this comment with a one or two sentence description of the fix. When you are done, please add the release:note label to this PR.