diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 63e5ba859b5bee..38f0f52a04408d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34551,6 +34551,67 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
   return V;
 }
 
+// Attempt to commute shufps LHS loads:
+// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
+static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
+                                      SelectionDAG &DAG) {
+  // TODO: Add general vXf32 + vXf64 support.
+  if (VT != MVT::v4f32)
+    return SDValue();
+
+  // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
+  auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
+    if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
+      return SDValue();
+    SDValue N0 = V.getOperand(0);
+    SDValue N1 = V.getOperand(1);
+    unsigned Imm = V.getConstantOperandVal(2);
+    if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
+        MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+      return SDValue();
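+    // SHUFPS can only fold a load into its second source operand, so move the
+    // load over to the RHS and swap the two index nibbles of the immediate
+    // (lanes 0-1 select from operand 0, lanes 2-3 from operand 1).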
+    Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
+    return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
+                       DAG.getTargetConstant(Imm, DL, MVT::i8));
+  };
+
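+  // Commuting the inner SHUFP swaps the two halves of its result, so every
+  // outer 2-bit shuffle index that refers to it must be XOR'd with 0b10:
+  // 0xAA fixes up all four indices, 0x0A / 0xA0 just the low / high nibble.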
+  switch (N.getOpcode()) {
+  case X86ISD::VPERMILPI:
+    if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
+      unsigned Imm = N.getConstantOperandVal(1);
+      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
+                         DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+    }
+    break;
+  case X86ISD::SHUFP: {
+    SDValue N0 = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    unsigned Imm = N.getConstantOperandVal(2);
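+    // If both operands are the same node, the single commuted SHUFP replaces
+    // both uses; otherwise try to commute either operand on its own.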
+    if (N0 == N1) {
+      if (SDValue NewSHUFP = commuteSHUFP(N, N0))
+        return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
+                           DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+    } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
+      return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
+                         DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
+    } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
+      return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
+                         DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
+    }
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
 /// Try to combine x86 target specific shuffles.
 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
@@ -34588,27 +34649,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     }
   }
 
-  // Attempt to commute shufps LHS loads:
-  // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
-  if (VT == MVT::v4f32 &&
-      (X86ISD::VPERMILPI == Opcode ||
-       (X86ISD::SHUFP == Opcode && N.getOperand(0) == N.getOperand(1)))) {
-    SDValue N0 = N.getOperand(0);
-    unsigned Imm = N.getConstantOperandVal(X86ISD::VPERMILPI == Opcode ? 1 : 2);
-    if (N0.getOpcode() == X86ISD::SHUFP && N->isOnlyUserOf(N0.getNode())) {
-      SDValue N00 = N0.getOperand(0);
-      SDValue N01 = N0.getOperand(1);
-      if (MayFoldLoad(peekThroughOneUseBitcasts(N00)) &&
-          !MayFoldLoad(peekThroughOneUseBitcasts(N01))) {
-        unsigned Imm1 = N0.getConstantOperandVal(2);
-        Imm1 = ((Imm1 & 0x0F) << 4) | ((Imm1 & 0xF0) >> 4);
-        SDValue NewN0 = DAG.getNode(X86ISD::SHUFP, DL, VT, N01, N00,
-                                    DAG.getTargetConstant(Imm1, DL, MVT::i8));
-        return DAG.getNode(X86ISD::SHUFP, DL, VT, NewN0, NewN0,
-                           DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
-      }
-    }
-  }
+  if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
+    return R;
 
   switch (Opcode) {
   case X86ISD::VBROADCAST: {
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 8460e8666bf4da..20db28ca3b2a5c 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1198,44 +1198,42 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
 define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
 ; SSE2-LABEL: interleave_24i32_out:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqu 64(%rdi), %xmm10
 ; SSE2-NEXT:    movups 80(%rdi), %xmm8
+; SSE2-NEXT:    movups 64(%rdi), %xmm4
 ; SSE2-NEXT:    movdqu (%rdi), %xmm0
-; SSE2-NEXT:    movdqu 16(%rdi), %xmm11
-; SSE2-NEXT:    movups 32(%rdi), %xmm5
-; SSE2-NEXT:    movdqu 48(%rdi), %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movaps %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[2,0]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[2,0]
-; SSE2-NEXT:    movaps %xmm8, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[1,1,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[2,0]
-; SSE2-NEXT:    movdqa %xmm9, %xmm2
-; SSE2-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,3],xmm8[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
-; SSE2-NEXT:    movaps %xmm2, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm10[3,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm11[3,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0]
-; SSE2-NEXT:    movups %xmm9, 16(%rsi)
-; SSE2-NEXT:    movups %xmm3, (%rsi)
-; SSE2-NEXT:    movups %xmm2, 16(%rdx)
+; SSE2-NEXT:    movups 16(%rdi), %xmm6
+; SSE2-NEXT:    movups 32(%rdi), %xmm10
+; SSE2-NEXT:    movups 48(%rdi), %xmm12
+; SSE2-NEXT:    movdqa %xmm0, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[1,1,2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[1,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,3],xmm6[0,2]
+; SSE2-NEXT:    movaps %xmm12, %xmm6
+; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[0,0]
+; SSE2-NEXT:    movaps %xmm6, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[1,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm12[2,3,0,1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,3],xmm4[0,2]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2],xmm8[2,0]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,0]
+; SSE2-NEXT:    movups %xmm12, 16(%rsi)
+; SSE2-NEXT:    movups %xmm11, (%rsi)
+; SSE2-NEXT:    movups %xmm6, 16(%rdx)
 ; SSE2-NEXT:    movups %xmm0, (%rdx)
-; SSE2-NEXT:    movups %xmm1, 16(%rcx)
-; SSE2-NEXT:    movups %xmm6, (%rcx)
+; SSE2-NEXT:    movups %xmm5, 16(%rcx)
+; SSE2-NEXT:    movups %xmm7, (%rcx)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: interleave_24i32_out:
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
index cd9d2692ff3f92..c6815a278f829e 100644
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -34,18 +34,18 @@ define <4 x float> @t2(<4 x float>* %P) nounwind {
 ; X32-LABEL: t2:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movaps (%eax), %xmm1
 ; X32-NEXT:    xorps %xmm0, %xmm0
-; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: t2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps (%rdi), %xmm1
 ; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; X64-NEXT:    retq
   %tmp1 = load <4 x float>, <4 x float>* %P
   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
@@ -74,18 +74,18 @@ define <4 x float> @t4(<4 x float>* %P) nounwind {
 ; X32-LABEL: t4:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movaps (%eax), %xmm0
 ; X32-NEXT:    xorps %xmm1, %xmm1
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: t4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps (%rdi), %xmm0
 ; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
-; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; X64-NEXT:    retq
   %tmp1 = load <4 x float>, <4 x float>* %P
   %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index dd67f9bfc43004..9ba639784e1d67 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2496,16 +2496,15 @@ define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, <4 x float>* %a1) {
 define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, <4 x float>* %a1) {
 ; SSE-LABEL: shuffle_mem_v4f32_4760:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
 ; SSE-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovaps (%rdi), %xmm1
-; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
+; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_mem_v4f32_4760: