[DAG] SimplifyMultipleUseDemandedBits - remove superfluous bitcasts
If the BITCASTs that SimplifyMultipleUseDemandedBits peeks through lead back to a node of the original type, then the BITCASTs are superfluous and we can remove them entirely, returning the source node directly.

Differential Revision: https://reviews.llvm.org/D79572
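As a rough illustration of the transform (a standalone toy model with made-up node and type names, not the actual SelectionDAG API), the early-out added below amounts to: peel the chain of bitcasts, and if the value underneath already has the destination type, hand that value back and drop the bitcasts.

#include <cassert>
#include <memory>
#include <string>

// Toy model only (not the LLVM API): a node is either a leaf value with a
// type, or a "bitcast" that wraps another node and retags its type.
struct Node {
  std::string Type;              // e.g. "v4i32", "v2i64"
  std::shared_ptr<Node> Operand; // non-null only for bitcast nodes
};

// Analogue of peekThroughBitcasts(): walk past any chain of bitcasts.
static std::shared_ptr<Node> peekThroughBitcasts(std::shared_ptr<Node> N) {
  while (N->Operand)
    N = N->Operand;
  return N;
}

// Analogue of the new early-out: if peeking through the bitcast chain lands
// on a node that already has the destination type, the bitcasts are
// superfluous and the underlying source can be used directly.
static std::shared_ptr<Node> simplifyBitcast(std::shared_ptr<Node> Op) {
  std::shared_ptr<Node> Src = peekThroughBitcasts(Op);
  if (Src->Type == Op->Type)
    return Src; // drop the whole bitcast chain
  return Op;    // otherwise keep analysing (as the real code does)
}

int main() {
  auto X = std::make_shared<Node>(Node{"v4i32", nullptr});  // original value
  auto AsI64 = std::make_shared<Node>(Node{"v2i64", X});    // bitcast to v2i64
  auto Back = std::make_shared<Node>(Node{"v4i32", AsI64}); // bitcast back to v4i32
  assert(simplifyBitcast(Back) == X); // bitcast(bitcast(X)) collapses to X
  return 0;
}

In the real patch this is the two-line early-out (if (SrcVT == DstVT) return Src;) in the TargetLowering.cpp hunk below; the test updates show the bitcast-free code generation that falls out of it.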
RKSimon committed May 8, 2020
1 parent d26a8da commit 70293ba
Showing 11 changed files with 457 additions and 494 deletions.
4 changes: 3 additions & 1 deletion llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -617,9 +617,11 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
    SDValue Src = peekThroughBitcasts(Op.getOperand(0));
    EVT SrcVT = Src.getValueType();
    EVT DstVT = Op.getValueType();
    if (SrcVT == DstVT)
      return Src;

    unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
    unsigned NumDstEltBits = DstVT.getScalarSizeInBits();

    if (NumSrcEltBits == NumDstEltBits)
      if (SDValue V = SimplifyMultipleUseDemandedBits(
              Src, DemandedBits, DemandedElts, DAG, Depth + 1))
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -782,17 +782,16 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
; VI-NEXT: v_and_b32_e32 v7, 15, v5
; VI-NEXT: v_lshrrev_b16_e32 v8, v7, v3
; VI-NEXT: v_sub_u16_e32 v7, 16, v7
; VI-NEXT: s_mov_b32 s4, 0xf000f
; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1
; VI-NEXT: v_and_b32_e32 v5, s4, v5
; VI-NEXT: v_and_b32_e32 v5, 15, v5
; VI-NEXT: v_or_b32_e32 v1, v1, v8
; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; VI-NEXT: v_and_b32_e32 v3, 15, v4
; VI-NEXT: v_lshrrev_b16_e32 v5, v3, v2
; VI-NEXT: v_sub_u16_e32 v3, 16, v3
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; VI-NEXT: v_and_b32_e32 v3, s4, v4
; VI-NEXT: v_and_b32_e32 v3, 0xf000f, v4
; VI-NEXT: v_or_b32_e32 v0, v0, v5
; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
13 changes: 6 additions & 7 deletions llvm/test/CodeGen/Thumb2/lsll0.ll
@@ -5,17 +5,16 @@ define void @_Z4loopPxS_iS_i(i64* %d) {
; CHECK-LABEL: _Z4loopPxS_iS_i:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: sxth r1, r1
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: asrs r3, r1, #31
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: strd r1, r3, [r0]
; CHECK-NEXT: strd r2, r12, [r0, #8]
; CHECK-NEXT: asr.w r12, r1, #31
; CHECK-NEXT: asrs r3, r2, #31
; CHECK-NEXT: strd r2, r3, [r0]
; CHECK-NEXT: strd r1, r12, [r0, #8]
; CHECK-NEXT: bx lr
entry:
%wide.load = load <2 x i64>, <2 x i64>* undef, align 8
29 changes: 13 additions & 16 deletions llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -6,26 +6,23 @@
define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) {
; CHECK-LABEL: vld3_v2i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r2, r3, [r0, #16]
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: ldrd r12, r3, [r0, #16]
; CHECK-NEXT: vmov.32 r0, q0[1]
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vdup.32 q1, r0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov.f64 d2, d0
; CHECK-NEXT: vmov.32 q2[2], r3
; CHECK-NEXT: vmov.32 r0, q0[2]
; CHECK-NEXT: vmov.f32 s12, s1
; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov.f32 s14, s8
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov r12, s6
; CHECK-NEXT: vdup.32 q1, r0
; CHECK-NEXT: vmov r0, s14
; CHECK-NEXT: add r0, r12
; CHECK-NEXT: add r0, r3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.32 r2, q0[2]
; CHECK-NEXT: vdup.32 q0, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: strd r2, r0, [r1]
; CHECK-NEXT: strd r0, r2, [r1]
; CHECK-NEXT: bx lr
entry:
%l1 = load <6 x i32>, <6 x i32>* %src, align 4
45 changes: 10 additions & 35 deletions llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -50,14 +50,9 @@ define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind {
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
; SSE2-SSSE3-NEXT: movl %eax, %ecx
; SSE2-SSSE3-NEXT: shrb $2, %cl
; SSE2-SSSE3-NEXT: movzbl %cl, %ecx
; SSE2-SSSE3-NEXT: andb $3, %al
; SSE2-SSSE3-NEXT: movzbl %al, %eax
; SSE2-SSSE3-NEXT: movd %eax, %xmm0
; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: addb %cl, %al
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: bitcast_v4i32_to_v2i2:
@@ -96,14 +91,9 @@ define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind {
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
; SSE2-SSSE3-NEXT: movl %eax, %ecx
; SSE2-SSSE3-NEXT: shrb $4, %cl
; SSE2-SSSE3-NEXT: movzbl %cl, %ecx
; SSE2-SSSE3-NEXT: andb $15, %al
; SSE2-SSSE3-NEXT: movzbl %al, %eax
; SSE2-SSSE3-NEXT: movd %eax, %xmm0
; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: addb %cl, %al
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: bitcast_v8i16_to_v2i4:
@@ -183,14 +173,9 @@ define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind {
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
; SSE2-SSSE3-NEXT: movl %eax, %ecx
; SSE2-SSSE3-NEXT: shrb $2, %cl
; SSE2-SSSE3-NEXT: movzbl %cl, %ecx
; SSE2-SSSE3-NEXT: andb $3, %al
; SSE2-SSSE3-NEXT: movzbl %al, %eax
; SSE2-SSSE3-NEXT: movd %eax, %xmm0
; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: addb %cl, %al
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: bitcast_v4i64_to_v2i2:
@@ -232,14 +217,9 @@ define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind {
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
; SSE2-SSSE3-NEXT: movl %eax, %ecx
; SSE2-SSSE3-NEXT: shrb $4, %cl
; SSE2-SSSE3-NEXT: movzbl %cl, %ecx
; SSE2-SSSE3-NEXT: andb $15, %al
; SSE2-SSSE3-NEXT: movzbl %al, %eax
; SSE2-SSSE3-NEXT: movd %eax, %xmm0
; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: addb %cl, %al
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: bitcast_v8i32_to_v2i4:
@@ -395,14 +375,9 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
; SSE2-SSSE3-NEXT: movl %eax, %ecx
; SSE2-SSSE3-NEXT: shrb $4, %cl
; SSE2-SSSE3-NEXT: movzbl %cl, %ecx
; SSE2-SSSE3-NEXT: andb $15, %al
; SSE2-SSSE3-NEXT: movzbl %al, %eax
; SSE2-SSSE3-NEXT: movd %eax, %xmm0
; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: addb %cl, %al
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: bitcast_v8i64_to_v2i4:
60 changes: 30 additions & 30 deletions llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -577,25 +577,25 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: cltd
; X86-NEXT: idivl %esi
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm5, %eax
; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,2,3]
; X86-NEXT: movd %xmm5, %esi
; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm4, %eax
; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
; X86-NEXT: movd %xmm4, %esi
; X86-NEXT: cltd
; X86-NEXT: idivl %esi
; X86-NEXT: movd %eax, %xmm5
; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; X86-NEXT: movdqa %xmm4, (%ecx)
; X86-NEXT: pmuludq %xmm1, %xmm4
; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0]
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT: movdqa %xmm2, (%ecx)
; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-NEXT: pmuludq %xmm5, %xmm1
; X86-NEXT: pmuludq %xmm3, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X86-NEXT: psubd %xmm3, %xmm0
; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-NEXT: psubd %xmm2, %xmm0
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -620,25 +620,25 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
; X64-NEXT: movd %xmm1, %ecx
; X64-NEXT: cltd
; X64-NEXT: idivl %ecx
; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
; X64-NEXT: movd %xmm5, %eax
; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,2,3]
; X64-NEXT: movd %xmm5, %ecx
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; X64-NEXT: movd %xmm4, %eax
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
; X64-NEXT: movd %xmm4, %ecx
; X64-NEXT: cltd
; X64-NEXT: idivl %ecx
; X64-NEXT: movd %eax, %xmm5
; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; X64-NEXT: movdqa %xmm4, (%rdi)
; X64-NEXT: pmuludq %xmm1, %xmm4
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; X64-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0]
; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT: movdqa %xmm2, (%rdi)
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X64-NEXT: pmuludq %xmm1, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT: pmuludq %xmm5, %xmm1
; X64-NEXT: pmuludq %xmm3, %xmm1
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X64-NEXT: psubd %xmm3, %xmm0
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: psubd %xmm2, %xmm0
; X64-NEXT: retq
%div = sdiv <4 x i32> %x, %y
store <4 x i32> %div, <4 x i32>* %divdst, align 16
60 changes: 30 additions & 30 deletions llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -577,25 +577,25 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm5, %eax
; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,2,3]
; X86-NEXT: movd %xmm5, %esi
; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm4, %eax
; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
; X86-NEXT: movd %xmm4, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
; X86-NEXT: movd %eax, %xmm5
; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; X86-NEXT: movdqa %xmm4, (%ecx)
; X86-NEXT: pmuludq %xmm1, %xmm4
; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0]
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT: movdqa %xmm2, (%ecx)
; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-NEXT: pmuludq %xmm5, %xmm1
; X86-NEXT: pmuludq %xmm3, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X86-NEXT: psubd %xmm3, %xmm0
; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-NEXT: psubd %xmm2, %xmm0
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -620,25 +620,25 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
; X64-NEXT: movd %xmm1, %ecx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
; X64-NEXT: movd %xmm5, %eax
; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,2,3]
; X64-NEXT: movd %xmm5, %ecx
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; X64-NEXT: movd %xmm4, %eax
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
; X64-NEXT: movd %xmm4, %ecx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
; X64-NEXT: movd %eax, %xmm5
; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; X64-NEXT: movdqa %xmm4, (%rdi)
; X64-NEXT: pmuludq %xmm1, %xmm4
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; X64-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0]
; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT: movdqa %xmm2, (%rdi)
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X64-NEXT: pmuludq %xmm1, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT: pmuludq %xmm5, %xmm1
; X64-NEXT: pmuludq %xmm3, %xmm1
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X64-NEXT: psubd %xmm3, %xmm0
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: psubd %xmm2, %xmm0
; X64-NEXT: retq
%div = udiv <4 x i32> %x, %y
store <4 x i32> %div, <4 x i32>* %divdst, align 16
