diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b4299d541079d..7db5770550db9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34444,6 +34444,25 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     }
   }
 
+  // Attempt to combine to INSERTPS, but only if the inserted element has come
+  // from a scalar.
+  // TODO: Handle other insertions here as well?
+  if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
+      MaskEltSizeInBits == 32 && Subtarget.hasSSE41() &&
+      !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
+    SDValue SrcV1 = V1, SrcV2 = V2;
+    if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) &&
+        SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+      if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+        return SDValue(); // Nothing to do!
+      Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
+                        DAG.getBitcast(MVT::v4f32, SrcV1),
+                        DAG.getBitcast(MVT::v4f32, SrcV2),
+                        DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+      return DAG.getBitcast(RootVT, Res);
+    }
+  }
+
   SDValue NewV1 = V1; // Save operands in case early exit happens.
   SDValue NewV2 = V2;
   if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll
index b80f6fa82d8b8..84ebabc927c16 100644
--- a/llvm/test/CodeGen/X86/build-vector-128.ll
+++ b/llvm/test/CodeGen/X86/build-vector-128.ll
@@ -540,17 +540,11 @@ define <4 x float> @PR37502(float %x, float %y) {
 ; AVX-32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX-32-NEXT:    retl
 ;
-; AVX1-64-LABEL: PR37502:
-; AVX1-64:       # %bb.0:
-; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX1-64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-64-NEXT:    retq
-;
-; AVX2-64-LABEL: PR37502:
-; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX2-64-NEXT:    retq
+; AVX-64-LABEL: PR37502:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-64-NEXT:    retq
   %i0 = insertelement <4 x float> undef, float %x, i32 0
   %i1 = insertelement <4 x float> %i0, float %y, i32 1
   %i2 = insertelement <4 x float> %i1, float %x, i32 2
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 4a10a20bc5ff7..f813fec5cab9e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -288,7 +288,7 @@ define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
 ;
 ; X64-AVX2-LABEL: buildvector_v4f32_0404:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; X64-AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; X64-AVX2-NEXT:    vmovaps %xmm0, (%rdi)
 ; X64-AVX2-NEXT:    retq
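
For reference (a note on the encoding, not part of the patch): the PermuteImm produced by matchShuffleAsInsertPS is the SSE4.1 INSERTPS control byte, where bits [7:6] select the element of the second source, bits [5:4] select the destination lane, and bits [3:0] zero result lanes. Below is a minimal standalone sketch of that decoding; decodeInsertPSImm is a hypothetical illustration-only helper, not an LLVM API.

  // Standalone sketch (not LLVM code): decode an SSE4.1 INSERTPS control byte.
  #include <cstdint>
  #include <cstdio>

  struct InsertPSImm {
    unsigned SrcIdx; // bits [7:6]: element taken from the second source
    unsigned DstIdx; // bits [5:4]: result lane that element is written to
    unsigned ZMask;  // bits [3:0]: result lanes forced to zero
  };

  // Hypothetical helper used only for illustration.
  static InsertPSImm decodeInsertPSImm(uint8_t Imm) {
    return {(Imm >> 6) & 0x3u, (Imm >> 4) & 0x3u, Imm & 0xFu};
  }

  int main() {
    // 0x10 -> source element 0, destination lane 1, no zeroing.
    InsertPSImm D = decodeInsertPSImm(0x10);
    std::printf("src=%u dst=%u zmask=0x%x\n", D.SrcIdx, D.DstIdx, D.ZMask);
    return 0;
  }

Decoding 0x10 gives source element 0, destination lane 1 and an empty zero mask, which matches the "xmm0 = xmm0[0],xmm1[0],xmm0[2,3]" comments on the vinsertps instructions in the updated test checks above.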