[LegalizeTypes][X86][PowerPC] Use shift by 1 instead of adding a value to itself to double. #86857
base: main
Conversation
@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-backend-x86

Author: Craig Topper (topperc)

Changes: Using a shift is the correct way to handle undef and works better with our optimizations that move freeze around. The X86 code looks like an improvement, but PowerPC might be a regression. Hoping this improves some code for #86850.

Full diff: https://github.com/llvm/llvm-project/pull/86857.diff

4 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index a55364ea2c4e5b..73e4b50e316a90 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -428,7 +428,8 @@ SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
std::swap(Lo, Hi);
SDValue Idx = N->getOperand(2);
- Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+ Idx = DAG.getNode(ISD::SHL, dl, Idx.getValueType(), Idx,
+ DAG.getShiftAmountConstant(1, Idx.getValueType(), dl));
NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Lo, Idx);
Idx = DAG.getNode(ISD::ADD, dl,
Idx.getValueType(), Idx,
diff --git a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
index aae23265710ce0..cb21efc5a35a85 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
@@ -165,15 +165,16 @@ define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) {
;
; CHECK-32-LABEL: testDoubleword:
; CHECK-32: # %bb.0: # %entry
-; CHECK-32-NEXT: add 5, 6, 6
; CHECK-32-NEXT: addi 7, 1, -32
+; CHECK-32-NEXT: rlwinm 5, 6, 3, 28, 28
; CHECK-32-NEXT: stxv 34, -32(1)
-; CHECK-32-NEXT: rlwinm 6, 5, 2, 28, 29
-; CHECK-32-NEXT: stwx 3, 7, 6
-; CHECK-32-NEXT: addi 3, 5, 1
-; CHECK-32-NEXT: addi 5, 1, -16
+; CHECK-32-NEXT: stwx 3, 7, 5
+; CHECK-32-NEXT: slwi 3, 6, 1
+; CHECK-32-NEXT: li 5, 1
; CHECK-32-NEXT: lxv 0, -32(1)
-; CHECK-32-NEXT: rlwinm 3, 3, 2, 28, 29
+; CHECK-32-NEXT: rlwimi 5, 3, 0, 0, 30
+; CHECK-32-NEXT: rlwinm 3, 5, 2, 28, 29
+; CHECK-32-NEXT: addi 5, 1, -16
; CHECK-32-NEXT: stxv 0, -16(1)
; CHECK-32-NEXT: stwx 4, 5, 3
; CHECK-32-NEXT: lxv 34, -16(1)
@@ -187,10 +188,11 @@ define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) {
;
; CHECK-32-P10-LABEL: testDoubleword:
; CHECK-32-P10: # %bb.0: # %entry
-; CHECK-32-P10-NEXT: add 5, 6, 6
-; CHECK-32-P10-NEXT: slwi 6, 5, 2
+; CHECK-32-P10-NEXT: slwi 5, 6, 1
+; CHECK-32-P10-NEXT: slwi 6, 6, 3
; CHECK-32-P10-NEXT: vinswlx 2, 6, 3
-; CHECK-32-P10-NEXT: addi 3, 5, 1
+; CHECK-32-P10-NEXT: li 3, 1
+; CHECK-32-P10-NEXT: rlwimi 3, 5, 0, 0, 30
; CHECK-32-P10-NEXT: slwi 3, 3, 2
; CHECK-32-P10-NEXT: vinswlx 2, 3, 4
; CHECK-32-P10-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
index b98aed8616509e..92dbb7e6c4b0ff 100644
--- a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
@@ -241,15 +241,16 @@ define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) {
;
; AIX-P8-32-LABEL: testDoubleword:
; AIX-P8-32: # %bb.0: # %entry
-; AIX-P8-32-NEXT: add r6, r6, r6
; AIX-P8-32-NEXT: addi r5, r1, -32
-; AIX-P8-32-NEXT: rlwinm r7, r6, 2, 28, 29
+; AIX-P8-32-NEXT: rlwinm r7, r6, 3, 28, 28
; AIX-P8-32-NEXT: stxvw4x v2, 0, r5
; AIX-P8-32-NEXT: stwx r3, r5, r7
; AIX-P8-32-NEXT: addi r3, r1, -16
; AIX-P8-32-NEXT: lxvw4x vs0, 0, r5
-; AIX-P8-32-NEXT: addi r5, r6, 1
-; AIX-P8-32-NEXT: rlwinm r5, r5, 2, 28, 29
+; AIX-P8-32-NEXT: slwi r5, r6, 1
+; AIX-P8-32-NEXT: li r6, 1
+; AIX-P8-32-NEXT: rlwimi r6, r5, 0, 0, 30
+; AIX-P8-32-NEXT: rlwinm r5, r6, 2, 28, 29
; AIX-P8-32-NEXT: stxvw4x vs0, 0, r3
; AIX-P8-32-NEXT: stwx r4, r3, r5
; AIX-P8-32-NEXT: lxvw4x v2, 0, r3
diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll
index 8ed8495d7a4614..3f4ee5c64a8abb 100644
--- a/llvm/test/CodeGen/X86/insertelement-var-index.ll
+++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll
@@ -1013,14 +1013,13 @@ define <2 x i64> @arg_i64_v2i64(<2 x i64> %v, i64 %x, i32 %y) nounwind {
; X86AVX2-NEXT: movl 12(%ebp), %ecx
; X86AVX2-NEXT: movl 16(%ebp), %edx
; X86AVX2-NEXT: vmovaps %xmm0, (%esp)
-; X86AVX2-NEXT: leal (%edx,%edx), %esi
-; X86AVX2-NEXT: andl $3, %esi
-; X86AVX2-NEXT: movl %eax, (%esp,%esi,4)
+; X86AVX2-NEXT: leal 1(%edx,%edx), %esi
+; X86AVX2-NEXT: andl $1, %edx
+; X86AVX2-NEXT: movl %eax, (%esp,%edx,8)
; X86AVX2-NEXT: vmovaps (%esp), %xmm0
; X86AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86AVX2-NEXT: leal 1(%edx,%edx), %eax
-; X86AVX2-NEXT: andl $3, %eax
-; X86AVX2-NEXT: movl %ecx, 16(%esp,%eax,4)
+; X86AVX2-NEXT: andl $3, %esi
+; X86AVX2-NEXT: movl %ecx, 16(%esp,%esi,4)
; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0
; X86AVX2-NEXT: leal -4(%ebp), %esp
; X86AVX2-NEXT: popl %esi
@@ -1362,14 +1361,13 @@ define <2 x i64> @load_i64_v2i64(<2 x i64> %v, ptr %p, i32 %y) nounwind {
; X86AVX2-NEXT: movl (%ecx), %edx
; X86AVX2-NEXT: movl 4(%ecx), %ecx
; X86AVX2-NEXT: vmovaps %xmm0, (%esp)
-; X86AVX2-NEXT: leal (%eax,%eax), %esi
-; X86AVX2-NEXT: andl $3, %esi
-; X86AVX2-NEXT: movl %edx, (%esp,%esi,4)
+; X86AVX2-NEXT: leal 1(%eax,%eax), %esi
+; X86AVX2-NEXT: andl $1, %eax
+; X86AVX2-NEXT: movl %edx, (%esp,%eax,8)
; X86AVX2-NEXT: vmovaps (%esp), %xmm0
; X86AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86AVX2-NEXT: leal 1(%eax,%eax), %eax
-; X86AVX2-NEXT: andl $3, %eax
-; X86AVX2-NEXT: movl %ecx, 16(%esp,%eax,4)
+; X86AVX2-NEXT: andl $3, %esi
+; X86AVX2-NEXT: movl %ecx, 16(%esp,%esi,4)
; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0
; X86AVX2-NEXT: leal -4(%ebp), %esp
; X86AVX2-NEXT: popl %esi
@@ -1746,14 +1744,13 @@ define <4 x i64> @arg_i64_v4i64(<4 x i64> %v, i64 %x, i32 %y) nounwind {
; X86AVX2-NEXT: movl 12(%ebp), %ecx
; X86AVX2-NEXT: movl 16(%ebp), %edx
; X86AVX2-NEXT: vmovaps %ymm0, (%esp)
-; X86AVX2-NEXT: leal (%edx,%edx), %esi
-; X86AVX2-NEXT: andl $7, %esi
-; X86AVX2-NEXT: movl %eax, (%esp,%esi,4)
+; X86AVX2-NEXT: leal 1(%edx,%edx), %esi
+; X86AVX2-NEXT: andl $3, %edx
+; X86AVX2-NEXT: movl %eax, (%esp,%edx,8)
; X86AVX2-NEXT: vmovaps (%esp), %ymm0
; X86AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X86AVX2-NEXT: leal 1(%edx,%edx), %eax
-; X86AVX2-NEXT: andl $7, %eax
-; X86AVX2-NEXT: movl %ecx, 32(%esp,%eax,4)
+; X86AVX2-NEXT: andl $7, %esi
+; X86AVX2-NEXT: movl %ecx, 32(%esp,%esi,4)
; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT: leal -4(%ebp), %esp
; X86AVX2-NEXT: popl %esi
@@ -2128,14 +2125,13 @@ define <4 x i64> @load_i64_v4i64(<4 x i64> %v, ptr %p, i32 %y) nounwind {
; X86AVX2-NEXT: movl (%ecx), %edx
; X86AVX2-NEXT: movl 4(%ecx), %ecx
; X86AVX2-NEXT: vmovaps %ymm0, (%esp)
-; X86AVX2-NEXT: leal (%eax,%eax), %esi
-; X86AVX2-NEXT: andl $7, %esi
-; X86AVX2-NEXT: movl %edx, (%esp,%esi,4)
+; X86AVX2-NEXT: leal 1(%eax,%eax), %esi
+; X86AVX2-NEXT: andl $3, %eax
+; X86AVX2-NEXT: movl %edx, (%esp,%eax,8)
; X86AVX2-NEXT: vmovaps (%esp), %ymm0
; X86AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X86AVX2-NEXT: leal 1(%eax,%eax), %eax
-; X86AVX2-NEXT: andl $7, %eax
-; X86AVX2-NEXT: movl %ecx, 32(%esp,%eax,4)
+; X86AVX2-NEXT: andl $7, %esi
+; X86AVX2-NEXT: movl %ecx, 32(%esp,%esi,4)
; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT: leal -4(%ebp), %esp
; X86AVX2-NEXT: popl %esi
If you really care about the x86 codegen for that testcase, there's a bigger issue here: we store/load from the stack twice for a single insertelement. (Or we could consider using a blend.)
And if you actually care about being undef-correct, I think you need to freeze the operand anyway: the splitting of the insert itself increases the number of uses of an undef value.
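For concreteness, a minimal sketch (my own illustration of that suggestion, not part of this patch) of freezing the index in ExpandOp_INSERT_VECTOR_ELT before it feeds both inserts; SelectionDAG::getFreeze is the existing helper for this:

  SDValue Idx = N->getOperand(2);
  // Pin a possibly-undef index to a single concrete value for both uses.
  Idx = DAG.getFreeze(Idx);
  // Double it; the shift keeps the low bit known zero, unlike Idx + Idx,
  // where each use of an undef operand may resolve to a different value.
  Idx = DAG.getNode(ISD::SHL, dl, Idx.getValueType(), Idx,
                    DAG.getShiftAmountConstant(1, Idx.getValueType(), dl));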
Force-pushed the commit "[LegalizeTypes][X86][PowerPC] Use shift by 1 instead of adding a value to itself to double." from 0cf3c67 to b19897a.
Should we also do this as a standalone combine?
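For reference, a standalone combine might look roughly like the following (hypothetical sketch: the helper name, placement, and missing profitability checks are all assumptions, and some targets canonicalize the other way because add x, x can be cheaper than a shift):

  // Hypothetical helper, not existing code: fold (add x, x) -> (shl x, 1)
  // so the low bit is known zero even when x is a frozen undef.
  static SDValue foldAddSelfToShl(SDNode *N, SelectionDAG &DAG) {
    if (N->getOpcode() != ISD::ADD || N->getOperand(0) != N->getOperand(1))
      return SDValue();
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    return DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                       DAG.getShiftAmountConstant(1, VT, DL));
  }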
This change makes sense to me for the PPC part.
Let an X86 reviewer check the X86 test changes before it is approved.
; CHECK-32-NEXT: addi 7, 1, -32
; CHECK-32-NEXT: slwi 5, 6, 1
This shows the advantage of changing the add to a shift on PPC:
- the shift can be combined with other shifts, like rlwinm (sanity-checked in the sketch below)
- the lowest bit of the shifted value is 0, so ori can be used instead of addi (although this may not be an improvement; ori and addi have the same latency)
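The rlwinm fusion from this hunk is easy to sanity-check standalone (my own illustration; modelling the rotate-and-mask as shift-and-mask is exact here because the masks discard any wrapped-around bits):

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t idx = 0; idx < (1u << 20); ++idx) {
      // Old: add 5,6,6 ; rlwinm 6,5,2,28,29  ->  ((idx+idx) << 2) & 0xC
      // New: rlwinm 5,6,3,28,28              ->   (idx << 3)      & 0x8
      // idx << 3 has its value-4 bit clear, so the narrower mask suffices.
      assert((((idx + idx) << 2) & 0xCu) == ((idx << 3) & 0x8u));
    }
  }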
; CHECK-32-P10-NEXT: vinswlx 2, 6, 3
; CHECK-32-P10-NEXT: addi 3, 5, 1
; CHECK-32-P10-NEXT: li 3, 1
; CHECK-32-P10-NEXT: rlwimi 3, 5, 0, 0, 30 |
This exposes a combine opportunity on PPC.
li 3, 1
rlwimi 3, 5, 0, 0, 30
should be just:
addi 3, 5, 1 // or: ori 3, 5, 1
We will handle this later.
li 3, 1 should be a zero-cycle operation on P10.
We could do this more generically. Given
slwi 5, 6, n
li 3, Val (if Val != 0, 1, -1, li is not a zero-cycle operation on P10)
rlwimi 3, 5, 0, 0, K (K = 31-n ... 31)
then if Val fits in the bits not covered by the mask (0 <= Val < 2^(31-K)), we can convert it to
addi 3, 5, Val
or
ori 3, 5, Val
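The condition can be checked with a tiny standalone program (my own illustration of the bit fact the combine relies on): when the low n bits of the shifted register are zero and Val fits in those n bits, the li+rlwimi sequence, ori, and addi all compute the same value.

  #include <cassert>
  #include <cstdint>

  int main() {
    const unsigned n = 1;     // shift amount from slwi
    const uint32_t Val = 1;   // immediate from li; Val < 2^n
    for (uint32_t r6 = 0; r6 < (1u << 20); ++r6) {
      uint32_t r5 = r6 << n;  // low n bits are known zero
      // rlwimi 3, 5, 0, 0, 30 merges r5's top 31 bits with Val's low bit:
      uint32_t merged = (r5 & ~1u) | (Val & 1u);
      assert(merged == (r5 | Val) && merged == r5 + Val);
    }
  }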
@@ -1019,7 +1019,7 @@ define <2 x i64> @arg_i64_v2i64(<2 x i64> %v, i64 %x, i32 %y) nounwind {
; X86AVX2-NEXT: movl %edx, (%esp,%esi,4)
; X86AVX2-NEXT: vmovaps (%esp), %xmm0
; X86AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
; X86AVX2-NEXT: incl %ecx
; X86AVX2-NEXT: orl $1, %ecx |
Why is OR better than INC? I see they have the same TP/ports, but OR is one byte longer, and latency seems longer too in some cases.
I guess we're missing addlike matching in the inc patterns?
CC @goldsteinn who was working on something similar for #83691
NB: that patch doesn't fix the issue.
Sadly my or_is_add patch did not work either. This is the final DAG:
SelectionDAG has 36 nodes:
t0: ch,glue = EntryToken
t2: v2i64,ch = CopyFromReg t0, Register:v2i64 %0
t52: ch = store<(store (s128) into %stack.1)> t0, t2, FrameIndex:i32<1>, undef:i32
t17: i32,ch = load<(load (s32) from %fixed-stack.2)> t0, FrameIndex:i32<-1>, undef:i32
t43: i32 = and t42, Constant:i32<3>
t51: i32 = shl t43, Constant:i8<2>
t48: i32 = or disjoint FrameIndex:i32<1>, t51
t46: ch = store<(store (s32))> t52, t17, t48, undef:i32
t47: v4i32,ch = load<(load (s128) from %stack.1)> t46, FrameIndex:i32<1>, undef:i32
t32: ch = store<(store (s128) into %stack.0)> t0, t47, FrameIndex:i32<0>, undef:i32
t19: i32 = add FrameIndex:i32<-1>, Constant:i32<4>
t20: i32,ch = load<(load (s32) from %fixed-stack.2 + 4)> t0, t19, undef:i32
t30: i32 = or t42, Constant:i32<1>
t35: i32 = and t30, Constant:i32<3>
t55: i32 = shl t35, Constant:i8<2>
t53: i32 = or disjoint FrameIndex:i32<0>, t55
t38: ch = store<(store (s32))> t32, t20, t53, undef:i32
t56: v2i64,ch = load<(load (s128) from %stack.0)> t38, FrameIndex:i32<0>, undef:i32
t14: ch,glue = CopyToReg t0, Register:v2i64 $xmm0, t56
t9: i32,ch = load<(load (s32) from %fixed-stack.0)> t0, FrameIndex:i32<-3>, undef:i32
t24: i32 = shl t9, Constant:i8<1>
t42: i32 = freeze t24
t15: ch = X86ISD::RET_GLUE t14, TargetConstant:i32<0>, Register:v2i64 $xmm0, t14:1
t30 is the or. It uses t42, the freeze of t24. We should have been able to hoist the freeze above t24, since it's the only user.

You can see there are 2 uses of t42. Both of those are nodes that a freeze was hoisted above. After the first freeze was hoisted, the shl was used by the hoisted freeze and one of the other nodes, so the freeze wasn't the only user of the shl. Then we did the second hoist, and the second hoisted freeze CSEd with the previous hoisted freeze, which decreased the use count of the shl and made the freeze its only user. We did not revisit the freeze after that.
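A plausible shape of the missing step (hypothetical sketch, not existing code): when CSE merges the duplicate freeze and shrinks the shl's use count, put the surviving freeze back on the combiner worklist so the hoist-through-single-use-op fold can fire again. AddToWorklist is the real DAGCombiner worklist API; the hook name and call site are assumptions.

  // Inside DAGCombiner (sketch):
  void DAGCombiner::revisitFreezeAfterCSE(SDNode *SurvivingFreeze) {
    // Re-queue the freeze; on the next visit its shl operand has a single
    // use, so visitFREEZE can hoist the freeze above it.
    AddToWorklist(SurvivingFreeze);
  }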
Should fix the cases noted in #86857