diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 38be3a82af658..2ce4fa51692b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9987,19 +9987,29 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
         MaskSize == (int)ExpectedOp.getNumOperands())
       return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
     break;
-  case ISD::BITCAST:
-    if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) {
-      SDValue Src = peekThroughBitcasts(Op);
-      EVT SrcVT = Src.getValueType();
-      if (SrcVT.isVector() &&
-          (SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
+  case ISD::BITCAST: {
+    SDValue Src = peekThroughBitcasts(Op);
+    EVT SrcVT = Src.getValueType();
+    if (Op == ExpectedOp && SrcVT.isVector() &&
+        (int)VT.getVectorNumElements() == MaskSize) {
+      if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
         unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
         return (Idx % Scale) == (ExpectedIdx % Scale) &&
                IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
                                    Idx / Scale, ExpectedIdx / Scale);
       }
+      if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
+        unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+        for (unsigned I = 0; I != Scale; ++I)
+          if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
+                                   (Idx * Scale) + I,
+                                   (ExpectedIdx * Scale) + I))
+            return false;
+        return true;
+      }
     }
     break;
+  }
   case ISD::VECTOR_SHUFFLE: {
     auto *SVN = cast<ShuffleVectorSDNode>(Op);
     return Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize &&
diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
index 632f3c6c1e851..2609b06361af5 100644
--- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -532,10 +532,10 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2@PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT:    vpblendd $13, (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm1 = mem[0],xmm0[1],mem[2,3]
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    addq $40, %rsp
 ; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index e2cc3ae0dca0a..443275e11459d 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -179,8 +179,8 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-SLOW-NEXT:    haddps %xmm7, %xmm6
 ; SSSE3-SLOW-NEXT:    haddps %xmm6, %xmm6
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
-; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm6 = xmm2[1],xmm6[1]
+; SSSE3-SLOW-NEXT:    movaps %xmm6, %xmm1
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -345,8 +345,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSSE3-SLOW-NEXT:    phaddd %xmm7, %xmm6
 ; SSSE3-SLOW-NEXT:    phaddd %xmm6, %xmm6
-; SSSE3-SLOW-NEXT:    palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; SSSE3-SLOW-NEXT:    movdqa %xmm6, %xmm1
+; SSSE3-SLOW-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1]
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
@@ -374,7 +373,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-SLOW-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
 ; AVX1-SLOW-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
 ; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
 ; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
@@ -397,7 +396,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-FAST-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
 ; AVX1-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
 ; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
 ; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
@@ -422,7 +421,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
 ; AVX2-SLOW-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-SLOW-NEXT:    vpbroadcastd %xmm4, %xmm5
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
@@ -445,7 +444,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-FAST-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
 ; AVX2-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-FAST-NEXT:    vpbroadcastd %xmm4, %xmm5
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
 ; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 8510b1031d717..1bbf92e45fc6c 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3138,10 +3138,10 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2@PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT:    vpblendd $13, (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm1 = mem[0],xmm0[1],mem[2,3]
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    addq $40, %rsp
 ; AVX512-NEXT:    retq
   %1 = fptrunc <2 x double> %a0 to <2 x half>
@@ -3272,8 +3272,8 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2@PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT:    addq $72, %rsp
 ; AVX512-NEXT:    retq
   %1 = fptrunc <4 x double> %a0 to <4 x half>
@@ -3404,8 +3404,8 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2@PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT:    addq $72, %rsp
 ; AVX512-NEXT:    retq
   %1 = fptrunc <4 x double> %a0 to <4 x half>
@@ -4107,8 +4107,8 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2@PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rbx)
 ; AVX512-NEXT:    addq $64, %rsp
 ; AVX512-NEXT:    popq %rbx
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index a166bebae721c..98b5bab98c4f9 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -1569,11 +1569,12 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
 }
 
 define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
-; X86-SSE2-LABEL: mul_v2i64_0_1:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    xorpd %xmm1, %xmm1
-; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE2-NEXT:    retl
+; SSE2-LABEL: mul_v2i64_0_1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; SSE4-LABEL: mul_v2i64_0_1:
 ; SSE4:       # %bb.0:
@@ -1581,13 +1582,6 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; SSE4-NEXT:    ret{{[l|q]}}
 ;
-; X64-SSE2-LABEL: mul_v2i64_0_1:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    xorps %xmm1, %xmm1
-; X64-SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; X64-SSE2-NEXT:    movaps %xmm1, %xmm0
-; X64-SSE2-NEXT:    retq
-;
 ; X64-AVX-LABEL: mul_v2i64_0_1:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index fed72ff5c2b39..7fbb211b69ccf 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -7065,17 +7065,29 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512BW-SLOW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
+; AVX512BW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias