From 3b04a4e322e442690bab05d4163ed40d7fd9f950 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 7 Oct 2018 11:45:46 +0000
Subject: [PATCH] [SelectionDAG] Respect multiple uses in SimplifyDemandedBits to SimplifyDemandedVectorElts simplification

rL343913 was using SimplifyDemandedBits's original demanded mask instead
of the adjusted 'NewMask' that accounts for multiple uses of the op
(those variable names really need improving....).

Annoyingly many of the test changes (back to pre-rL343913 state) are
actually safe - but only because their multiple uses are all by
PMULDQ/PMULUDQ.

Thanks to Jan Vesely (@jvesely) for bisecting the bug.

llvm-svn: 343935
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   2 +-
 .../CodeGen/X86/avx2-intrinsics-fast-isel.ll  |   9 ++
 .../X86/avx512-intrinsics-fast-isel.ll        |   5 +
 llvm/test/CodeGen/X86/pmul.ll                 | 113 +++++++++++-------
 .../CodeGen/X86/sse2-intrinsics-fast-isel.ll  |  28 ++++-
 .../CodeGen/X86/sse41-intrinsics-fast-isel.ll |  17 ++-
 llvm/test/CodeGen/X86/vector-mul.ll           |  56 +++++----
 llvm/test/CodeGen/X86/vector-trunc-math.ll    |  57 +++++----
 8 files changed, 183 insertions(+), 104 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8b6b4520ad647..1c3c60b1a8582 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1219,7 +1219,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       DemandedSubElts = APInt::getNullValue(Scale);
       for (unsigned i = 0; i != Scale; ++i) {
         unsigned Offset = i * NumSrcEltBits;
-        APInt Sub = DemandedMask.extractBits(NumSrcEltBits, Offset);
+        APInt Sub = NewMask.extractBits(NumSrcEltBits, Offset);
         if (Sub.isAllOnesValue())
           DemandedSubElts.setBit(i);
         else if (!Sub.isNullValue())
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index f064e861a0f01..e0c7a5e673b8c 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1826,6 +1826,12 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epi32:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllq $32, %ymm0, %ymm2
+; CHECK-NEXT: vpsrad $31, %ymm2, %ymm2
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; CHECK-NEXT: vpsllq $32, %ymm1, %ymm2
+; CHECK-NEXT: vpsrad $31, %ymm2, %ymm2
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %A = shl <4 x i64> %a0,
@@ -1840,6 +1846,9 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epu32:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %A = and <4 x i64> %a0,
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 2c8c4693d69c4..f889bb905505c 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1718,6 +1718,11 @@ entry:
 define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_mul_epu32:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; CHECK-NEXT: kmovw %eax, %k0
+; CHECK-NEXT: knotw %k0, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
 ; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %tmp = and <8 x i64> %__A,
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index d5614fe8daa51..36f06a64a6540 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1318,55 +1318,76 @@ entry:
 define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
 ; SSE2-LABEL: mul_v8i64_sext:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm8
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm7, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
 ; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
-; SSE2-NEXT: pmuludq %xmm9, %xmm4
-; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
-; SSE2-NEXT: pmuludq %xmm9, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm5, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm0, %xmm4
+; SSE2-NEXT: paddq %xmm3, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pmuludq %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-NEXT: psllq $32, %xmm4
 ; SSE2-NEXT: paddq %xmm4, %xmm0
-; SSE2-NEXT: pmuludq %xmm11, %xmm5
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
-; SSE2-NEXT: pmuludq %xmm11, %xmm1
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: paddq %xmm5, %xmm1
-; SSE2-NEXT: pmuludq %xmm10, %xmm6
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1]
-; SSE2-NEXT: pmuludq %xmm10, %xmm2
-; SSE2-NEXT: psllq $32, %xmm2
-; SSE2-NEXT: paddq %xmm6, %xmm2
-; SSE2-NEXT: pmuludq %xmm8, %xmm7
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1]
-; SSE2-NEXT: pmuludq %xmm8, %xmm3
-; SSE2-NEXT: psllq $32, %xmm3
-; SSE2-NEXT: paddq %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm1, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm3, %xmm4
+; SSE2-NEXT: paddq %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: psrad $16, %xmm6
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: paddq %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm2, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm6, %xmm4
+; SSE2-NEXT: paddq %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pmuludq %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: psrad $16, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: paddq %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm3, %xmm6
+; SSE2-NEXT: pmuludq %xmm5, %xmm4
+; SSE2-NEXT: paddq %xmm6, %xmm4
+; SSE2-NEXT: pmuludq %xmm5, %xmm3
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: paddq %xmm4, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: mul_v8i64_sext:
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 06793e5d33041..83d3a0e0b9546 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2755,13 +2755,23 @@ define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
 define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
-; SSE-LABEL: test_mm_mul_epu32:
-; SSE: # %bb.0:
-; SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
-; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; X86-SSE-LABEL: test_mm_mul_epu32:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; X86-SSE-NEXT: # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
+; X86-SSE-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
+; X86-SSE-NEXT: pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2]
+; X86-SSE-NEXT: pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca]
+; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
+; X86-SSE-NEXT: retl # encoding: [0xc3]
 ;
 ; AVX1-LABEL: test_mm_mul_epu32:
 ; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xef,0xd2]
+; AVX1-NEXT: vpblendw $204, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xcc]
+; AVX1-NEXT: # xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpblendw $204, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x0e,0xca,0xcc]
+; AVX1-NEXT: # xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf4,0xc1]
 ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
 ;
@@ -2774,6 +2784,16 @@ define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; AVX512-NEXT: # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
 ; AVX512-NEXT: vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
 ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-SSE-LABEL: test_mm_mul_epu32:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; X64-SSE-NEXT: # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
+; X64-SSE-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
+; X64-SSE-NEXT: pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2]
+; X64-SSE-NEXT: pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca]
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
+; X64-SSE-NEXT: retq # encoding: [0xc3]
   %A = and <2 x i64> %a0,
   %B = and <2 x i64> %a1,
   %res = mul nuw <2 x i64> %A, %B
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
index 9990ac00eb054..dd82bef1d1113 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -832,11 +832,26 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
 define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-LABEL: test_mm_mul_epi32:
 ; SSE: # %bb.0:
-; SSE-NEXT: pmuldq %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: psrad $31, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE-NEXT: pmuldq %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
 ; SSE-NEXT: ret{{[l|q]}}
 ;
 ; AVX1-LABEL: test_mm_mul_epi32:
 ; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: ret{{[l|q]}}
 ;
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index fef9cde64b2e0..f039fa15d3b31 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -460,7 +460,7 @@ define <16 x i8> @mul_v16i8_neg5(<16 x i8> %a0) nounwind {
 define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind {
 ; X86-LABEL: mul_v2i64_17_65:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <17,u,65,u>
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0]
 ; X86-NEXT: movdqa %xmm0, %xmm2
 ; X86-NEXT: pmuludq %xmm1, %xmm2
 ; X86-NEXT: psrlq $32, %xmm0
@@ -809,7 +809,7 @@ define <16 x i8> @mul_v16i8_neg15(<16 x i8> %a0) nounwind {
 define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
 ; X86-LABEL: mul_v2i64_15_63:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <15,u,63,u>
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0]
 ; X86-NEXT: movdqa %xmm0, %xmm2
 ; X86-NEXT: pmuludq %xmm1, %xmm2
 ; X86-NEXT: psrlq $32, %xmm0
@@ -845,17 +845,16 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
 define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
 ; X86-LABEL: mul_v2i64_neg_15_63:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm1
-; X86-NEXT: psrlq $32, %xmm1
-; X86-NEXT: movdqa {{.*#+}} xmm2 = <4294967281,u,4294967233,u>
-; X86-NEXT: pmuludq %xmm2, %xmm1
-; X86-NEXT: movdqa %xmm2, %xmm3
-; X86-NEXT: psrlq $32, %xmm3
-; X86-NEXT: pmuludq %xmm0, %xmm3
-; X86-NEXT: paddq %xmm1, %xmm3
-; X86-NEXT: psllq $32, %xmm3
-; X86-NEXT: pmuludq %xmm2, %xmm0
-; X86-NEXT: paddq %xmm3, %xmm0
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: pmuludq %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: psrlq $32, %xmm2
+; X86-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295]
+; X86-NEXT: pmuludq %xmm3, %xmm2
+; X86-NEXT: paddq %xmm1, %xmm2
+; X86-NEXT: psllq $32, %xmm2
+; X86-NEXT: pmuludq %xmm3, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: mul_v2i64_neg_15_63:
@@ -890,17 +889,16 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
 define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
 ; X86-LABEL: mul_v2i64_neg_17_65:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm1
-; X86-NEXT: psrlq $32, %xmm1
-; X86-NEXT: movdqa {{.*#+}} xmm2 = <4294967279,u,4294967231,u>
-; X86-NEXT: pmuludq %xmm2, %xmm1
-; X86-NEXT: movdqa %xmm2, %xmm3
-; X86-NEXT: psrlq $32, %xmm3
-; X86-NEXT: pmuludq %xmm0, %xmm3
-; X86-NEXT: paddq %xmm1, %xmm3
-; X86-NEXT: psllq $32, %xmm3
-; X86-NEXT: pmuludq %xmm2, %xmm0
-; X86-NEXT: paddq %xmm3, %xmm0
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: pmuludq %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: psrlq $32, %xmm2
+; X86-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295]
+; X86-NEXT: pmuludq %xmm3, %xmm2
+; X86-NEXT: paddq %xmm1, %xmm2
+; X86-NEXT: psllq $32, %xmm2
+; X86-NEXT: pmuludq %xmm3, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: mul_v2i64_neg_17_65:
@@ -935,7 +933,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
 define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
 ; X86-LABEL: mul_v2i64_0_1:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,u,1,u>
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,0]
 ; X86-NEXT: movdqa %xmm0, %xmm2
 ; X86-NEXT: pmuludq %xmm1, %xmm2
 ; X86-NEXT: psrlq $32, %xmm0
@@ -977,7 +975,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
 ; X86: # %bb.0:
 ; X86-NEXT: movdqa %xmm0, %xmm1
 ; X86-NEXT: psrlq $32, %xmm1
-; X86-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u>
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,4294967295]
 ; X86-NEXT: pmuludq %xmm2, %xmm1
 ; X86-NEXT: movdqa %xmm2, %xmm3
 ; X86-NEXT: psrlq $32, %xmm3
@@ -1031,7 +1029,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
 ; X86: # %bb.0:
 ; X86-NEXT: movdqa %xmm0, %xmm1
 ; X86-NEXT: psrlq $32, %xmm1
-; X86-NEXT: movdqa {{.*#+}} xmm2 = <15,u,4294967233,u>
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [15,0,4294967233,4294967295]
 ; X86-NEXT: pmuludq %xmm2, %xmm1
 ; X86-NEXT: movdqa %xmm2, %xmm3
 ; X86-NEXT: psrlq $32, %xmm3
@@ -1174,7 +1172,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
 define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
 ; X86-LABEL: mul_v2i64_68_132:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <68,u,132,u>
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0]
 ; X86-NEXT: movdqa %xmm0, %xmm2
 ; X86-NEXT: pmuludq %xmm1, %xmm2
 ; X86-NEXT: psrlq $32, %xmm0
@@ -1210,7 +1208,7 @@ define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
 define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind {
 ; X86-LABEL: mul_v2i64_60_120:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <60,u,124,u>
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0]
 ; X86-NEXT: movdqa %xmm0, %xmm2
 ; X86-NEXT: pmuludq %xmm1, %xmm2
 ; X86-NEXT: psrlq $32, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index d274bc9b2d194..d9f186e64f1d9 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -5595,29 +5595,40 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi
 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: psllq $32, %xmm2
-; SSE-NEXT: paddq %xmm0, %xmm2
-; SSE-NEXT: pmuludq %xmm5, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; SSE-NEXT: pmuludq %xmm5, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: paddq %xmm3, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; SSE-NEXT: paddd %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: psrad $31, %xmm6
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: psrad $31, %xmm5
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: psrad $31, %xmm7
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
+; SSE-NEXT: pxor %xmm8, %xmm8
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
+; SSE-NEXT: pmuludq %xmm1, %xmm6
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; SSE-NEXT: pmuludq %xmm0, %xmm7
+; SSE-NEXT: paddq %xmm6, %xmm7
+; SSE-NEXT: psllq $32, %xmm7
+; SSE-NEXT: pmuludq %xmm0, %xmm1
+; SSE-NEXT: paddq %xmm7, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE-NEXT: pmuludq %xmm4, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; SSE-NEXT: pmuludq %xmm2, %xmm5
+; SSE-NEXT: paddq %xmm3, %xmm5
+; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: pmuludq %xmm2, %xmm4
+; SSE-NEXT: paddq %xmm5, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; SSE-NEXT: paddd %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: mul_add_self_v4i64_v4i32:
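
Why the one-line change in TargetLowering.cpp matters: SimplifyDemandedBits works with two masks. 'DemandedMask' is what this particular caller needs, while 'NewMask' is the adjusted working mask that accounts for the op having other uses (for a multi-use op it is, broadly, widened so that bits other users may still read are kept). rL343913 derived the per-vector-element demand passed to SimplifyDemandedVectorElts from 'DemandedMask', so elements still read by other users could be reported as dead. The standalone C++ sketch below is an illustration only, not LLVM code; the masks, the two-element split and all names are made up for the example. It models a 64-bit value viewed as two 32-bit elements, where this caller only wants the low element but another user reads both:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // This caller only demands the low 32-bit element of the 64-bit value.
    const std::uint64_t CallerDemandedMask = 0x00000000FFFFFFFFull;
    // ...but the value has another user that reads all 64 bits.
    const bool HasOtherUses = true;

    // The multi-use-adjusted mask keeps the bits other users may observe.
    const std::uint64_t NewMask = HasOtherUses ? ~0ull : CallerDemandedMask;

    // Decide which 32-bit sub-elements are fully demanded, once per mask.
    for (unsigned i = 0; i != 2; ++i) {
      const std::uint32_t SubCaller =
          std::uint32_t(CallerDemandedMask >> (32 * i));
      const std::uint32_t SubNewMask = std::uint32_t(NewMask >> (32 * i));
      std::printf("element %u: caller mask -> %s, adjusted mask -> %s\n", i,
                  SubCaller == ~0u ? "demanded" : "dead",
                  SubNewMask == ~0u ? "demanded" : "dead");
    }
    // Element 1 is "dead" under the caller's mask but "demanded" under the
    // adjusted mask; deciding from the former would let the high element be
    // clobbered even though another user still reads it.
    return 0;
  }

In the TargetLowering.cpp hunk above, this corresponds to extracting the per-element sub-masks from NewMask rather than DemandedMask before handing the element demand to SimplifyDemandedVectorElts.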