[SDAG] Recursively legalize both vector mulo results
Split out from D61692 per RKSimon's suggestion. Vector op
legalization will automatically recursively legalize the returned
SDValue, but we need to take care of the other result ourselves.
Otherwise it will only get legalized during op legalization, by
which point it might be too late (though I'm not aware of any
specific cases right now).

There are codegen differences because expansion occurs earlier now
and we don't get a DAGCombiner run in between.
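
As a quick illustration (a minimal IR sketch modelled on the tests below;
the function name @umulo_example is invented), the vector overflow intrinsic
yields two results, and a caller may use either or both of them:

declare { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>)

define <4 x i32> @umulo_example(<4 x i32> %a, <4 x i32> %b) {
  ; Result 0 is the product vector, result 1 the per-lane overflow mask.
  ; Expanding the MULO node has to leave both values legalized.
  %m = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> %b)
  %val = extractvalue { <4 x i32>, <4 x i1> } %m, 0
  %ovf = extractvalue { <4 x i32>, <4 x i1> } %m, 1
  %sel = select <4 x i1> %ovf, <4 x i32> zeroinitializer, <4 x i32> %val
  ret <4 x i32> %sel
}

Whichever result the vector legalizer happens to visit first, the change
below registers the sibling result through LegalizeOp so it is recursively
legalized right away instead of being deferred to op legalization.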

Differential Revision: https://reviews.llvm.org/D61744

llvm-svn: 360470
nikic authored and MrSidims committed May 24, 2019
1 parent 7747ea4 commit dec7181
Showing 5 changed files with 259 additions and 261 deletions.
10 changes: 7 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1253,9 +1253,13 @@ SDValue VectorLegalizer::ExpandMULO(SDValue Op) {
   if (!TLI.expandMULO(Op.getNode(), Result, Overflow, DAG))
     std::tie(Result, Overflow) = DAG.UnrollVectorOverflowOp(Op.getNode());
 
-  AddLegalizedOperand(Op.getValue(0), Result);
-  AddLegalizedOperand(Op.getValue(1), Overflow);
-  return Op.getResNo() ? Overflow : Result;
+  if (Op.getResNo() == 0) {
+    AddLegalizedOperand(Op.getValue(1), LegalizeOp(Overflow));
+    return Result;
+  } else {
+    AddLegalizedOperand(Op.getValue(0), LegalizeOp(Result));
+    return Overflow;
+  }
 }
 
 SDValue VectorLegalizer::ExpandAddSubSat(SDValue Op) {
20 changes: 12 additions & 8 deletions llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -307,18 +307,22 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: xtn v1.4h, v0.4s
; CHECK-NEXT: umov w9, v1.h[1]
; CHECK-NEXT: umov w8, v1.h[0]
; CHECK-NEXT: shrn v1.4h, v0.4s, #16
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: umov w9, v0.h[1]
; CHECK-NEXT: umov w8, v0.h[0]
; CHECK-NEXT: and w9, w9, #0x1
; CHECK-NEXT: bfi w8, w9, #1, #1
; CHECK-NEXT: umov w9, v1.h[2]
; CHECK-NEXT: ushr v0.4h, v1.4h, #1
; CHECK-NEXT: umov w9, v0.h[2]
; CHECK-NEXT: cmeq v1.4h, v1.4h, #0
; CHECK-NEXT: ushr v2.4h, v0.4h, #1
; CHECK-NEXT: and w9, w9, #0x1
; CHECK-NEXT: cmeq v0.4h, v0.4h, #0
; CHECK-NEXT: bfi w8, w9, #2, #1
; CHECK-NEXT: umov w9, v1.h[3]
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: umov w9, v0.h[3]
; CHECK-NEXT: mvn v0.8b, v1.8b
; CHECK-NEXT: cmeq v1.4h, v2.4h, #0
; CHECK-NEXT: mvn v1.8b, v1.8b
; CHECK-NEXT: orr v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bfi w8, w9, #3, #29
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: and w8, w8, #0xf
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/X86/mulo-pow2.ll
@@ -45,10 +45,10 @@ define <4 x i32> @umul_v4i32_2(<4 x i32> %a, <4 x i32> %b) nounwind {
define <4 x i32> @umul_v4i32_8(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: umul_v4i32_8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpslld $3, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpslld $3, %xmm0, %xmm2
; AVX-NEXT: vpsrld $3, %xmm2, %xmm3
; AVX-NEXT: vpcmpeqd %xmm0, %xmm3, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
%x = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
%y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
@@ -60,10 +60,10 @@ define <4 x i32> @umul_v4i32_8(<4 x i32> %a, <4 x i32> %b) nounwind {
define <4 x i32> @umul_v4i32_2pow31(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: umul_v4i32_2pow31:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm2
; AVX-NEXT: vpsrld $31, %xmm2, %xmm3
; AVX-NEXT: vpcmpeqd %xmm0, %xmm3, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
%x = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>)
%y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
@@ -133,10 +133,10 @@ define <4 x i32> @smul_v4i32_8(<4 x i32> %a, <4 x i32> %b) nounwind {
define <4 x i32> @smul_v4i32_2pow31(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: smul_v4i32_2pow31:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm2
; AVX-NEXT: vpsrld $31, %xmm2, %xmm3
; AVX-NEXT: vpcmpeqd %xmm0, %xmm3, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
%x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>)
%y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
130 changes: 64 additions & 66 deletions llvm/test/CodeGen/X86/vec_smulo.ll
@@ -708,9 +708,9 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun
; SSE41-NEXT: pinsrd $1, %edx, %xmm3
; SSE41-NEXT: pinsrd $2, %ecx, %xmm3
; SSE41-NEXT: pinsrd $3, %r8d, %xmm3
; SSE41-NEXT: movd %r9d, %xmm0
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: movd %r9d, %xmm1
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
@@ -730,11 +730,11 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun
; SSE41-NEXT: pcmpeqd %xmm4, %xmm2
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
; SSE41-NEXT: pxor %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE41-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE41-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
; SSE41-NEXT: pmuldq %xmm4, %xmm5
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pmuldq %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pmuldq %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; SSE41-NEXT: pmulld %xmm0, %xmm1
@@ -1873,53 +1873,52 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSE2-NEXT: psrad $8, %xmm0
; SSE2-NEXT: pslld $8, %xmm1
; SSE2-NEXT: psrad $8, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: paddd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: psubd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psubd %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pslld $8, %xmm1
; SSE2-NEXT: psrad $8, %xmm1
; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[3,1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rdi)
; SSE2-NEXT: movd %xmm2, %ecx
; SSE2-NEXT: movw %cx, 3(%rdi)
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: movb %al, 2(%rdi)
; SSE2-NEXT: movd %xmm4, %eax
; SSE2-NEXT: shrl $16, %ecx
; SSE2-NEXT: movb %cl, 5(%rdi)
; SSE2-NEXT: movd %xmm5, %eax
; SSE2-NEXT: movw %ax, 9(%rdi)
; SSE2-NEXT: movd %xmm5, %ecx
; SSE2-NEXT: movd %xmm6, %ecx
; SSE2-NEXT: movw %cx, 6(%rdi)
; SSE2-NEXT: movd %xmm6, %edx
; SSE2-NEXT: movw %dx, 3(%rdi)
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: movb %al, 11(%rdi)
; SSE2-NEXT: shrl $16, %ecx
; SSE2-NEXT: movb %cl, 8(%rdi)
; SSE2-NEXT: shrl $16, %edx
; SSE2-NEXT: movb %dl, 5(%rdi)
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1929,53 +1928,52 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSSE3-NEXT: psrad $8, %xmm0
; SSSE3-NEXT: pslld $8, %xmm1
; SSSE3-NEXT: psrad $8, %xmm1
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm1, %xmm3
; SSSE3-NEXT: paddd %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: pand %xmm1, %xmm4
; SSSE3-NEXT: paddd %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSSE3-NEXT: psubd %xmm3, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm5, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: psubd %xmm4, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pslld $8, %xmm1
; SSSE3-NEXT: psrad $8, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
; SSSE3-NEXT: psrad $31, %xmm3
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[3,1,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSSE3-NEXT: psrad $31, %xmm4
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT: pxor %xmm3, %xmm4
; SSSE3-NEXT: pxor %xmm3, %xmm1
; SSSE3-NEXT: por %xmm4, %xmm1
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
; SSSE3-NEXT: movd %xmm2, %ecx
; SSSE3-NEXT: movw %cx, 3(%rdi)
; SSSE3-NEXT: shrl $16, %eax
; SSSE3-NEXT: movb %al, 2(%rdi)
; SSSE3-NEXT: movd %xmm4, %eax
; SSSE3-NEXT: shrl $16, %ecx
; SSSE3-NEXT: movb %cl, 5(%rdi)
; SSSE3-NEXT: movd %xmm5, %eax
; SSSE3-NEXT: movw %ax, 9(%rdi)
; SSSE3-NEXT: movd %xmm5, %ecx
; SSSE3-NEXT: movd %xmm6, %ecx
; SSSE3-NEXT: movw %cx, 6(%rdi)
; SSSE3-NEXT: movd %xmm6, %edx
; SSSE3-NEXT: movw %dx, 3(%rdi)
; SSSE3-NEXT: shrl $16, %eax
; SSSE3-NEXT: movb %al, 11(%rdi)
; SSSE3-NEXT: shrl $16, %ecx
; SSSE3-NEXT: movb %cl, 8(%rdi)
; SSSE3-NEXT: shrl $16, %edx
; SSSE3-NEXT: movb %dl, 5(%rdi)
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
