[X86][SSE] combineCommutableSHUFP - permilps(shufps(load(),x)) --> permilps(shufps(x,load()))

Pull out the combineTargetShuffle code added in rG3fd5d1c6e7db into a helper function and extend it to handle the shufps(shufps(load(),x),y) and shufps(y,shufps(load(),x)) cases as well.
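A quick way to see why the nibble swap and the XOR masks are correct (an illustrative model only, not part of the commit): shufps takes lanes 0-1 from its first operand and lanes 2-3 from its second, so commuting the operands while swapping the immediate's nibbles swaps the two 64-bit halves of the result, and the outer shuffle compensates by flipping bit 1 of each affected 2-bit lane index. A minimal standalone C++ sketch, with hypothetical shufps/permilps helpers standing in for the instruction semantics:

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;

// Model of SHUFPS: lanes 0-1 select from A, lanes 2-3 from B, each via a
// 2-bit field of the 8-bit immediate.
static V4 shufps(const V4 &A, const V4 &B, unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}

// Model of VPERMILPS with an immediate: all four lanes select from A.
static V4 permilps(const V4 &A, unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], A[(Imm >> 4) & 3], A[(Imm >> 6) & 3]};
}

int main() {
  V4 X = {0, 1, 2, 3}, Y = {4, 5, 6, 7};
  for (unsigned SImm = 0; SImm != 256; ++SImm) {
    // Commuting shufps swaps the nibbles of its immediate...
    unsigned Swapped = ((SImm & 0x0F) << 4) | ((SImm & 0xF0) >> 4);
    for (unsigned PImm = 0; PImm != 256; ++PImm) {
      // ...which swaps the result's 64-bit halves, so the outer permilps
      // flips bit 1 of every 2-bit lane index: Imm ^ 0xAA.
      assert(permilps(shufps(X, Y, SImm), PImm) ==
             permilps(shufps(Y, X, Swapped), PImm ^ 0xAA));
    }
  }
  return 0;
}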
RKSimon committed Jan 26, 2020
1 parent 4aea70e commit 1a81b29
Showing 4 changed files with 105 additions and 74 deletions.
76 changes: 55 additions & 21 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34551,6 +34551,59 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
  return V;
}

// Attempt to commute shufps LHS loads:
// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
                                      SelectionDAG &DAG) {
  // TODO: Add general vXf32 + vXf64 support.
  if (VT != MVT::v4f32)
    return SDValue();

  // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
  auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
    if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
      return SDValue();
    SDValue N0 = V.getOperand(0);
    SDValue N1 = V.getOperand(1);
    unsigned Imm = V.getConstantOperandVal(2);
    if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
        MayFoldLoad(peekThroughOneUseBitcasts(N1)))
      return SDValue();
    Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
    return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
                       DAG.getTargetConstant(Imm, DL, MVT::i8));
  };

  switch (N.getOpcode()) {
  case X86ISD::VPERMILPI:
    if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
      unsigned Imm = N.getConstantOperandVal(1);
      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
                         DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
    }
    break;
  case X86ISD::SHUFP: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    unsigned Imm = N.getConstantOperandVal(2);
    if (N0 == N1) {
      if (SDValue NewSHUFP = commuteSHUFP(N, N0))
        return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
                           DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
    } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
      return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
                         DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
    } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
      return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
                         DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
    }
    break;
  }
  }

  return SDValue();
}

/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
Expand Down Expand Up @@ -34588,27 +34641,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
}

// Attempt to commute shufps LHS loads:
// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
if (VT == MVT::v4f32 &&
(X86ISD::VPERMILPI == Opcode ||
(X86ISD::SHUFP == Opcode && N.getOperand(0) == N.getOperand(1)))) {
SDValue N0 = N.getOperand(0);
unsigned Imm = N.getConstantOperandVal(X86ISD::VPERMILPI == Opcode ? 1 : 2);
if (N0.getOpcode() == X86ISD::SHUFP && N->isOnlyUserOf(N0.getNode())) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
if (MayFoldLoad(peekThroughOneUseBitcasts(N00)) &&
!MayFoldLoad(peekThroughOneUseBitcasts(N01))) {
unsigned Imm1 = N0.getConstantOperandVal(2);
Imm1 = ((Imm1 & 0x0F) << 4) | ((Imm1 & 0xF0) >> 4);
SDValue NewN0 = DAG.getNode(X86ISD::SHUFP, DL, VT, N01, N00,
DAG.getTargetConstant(Imm1, DL, MVT::i8));
return DAG.getNode(X86ISD::SHUFP, DL, VT, NewN0, NewN0,
DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
}
}
}
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;

switch (Opcode) {
case X86ISD::VBROADCAST: {
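The extension this commit adds covers outer shufps nodes too, where only the half of the outer immediate that reads the commuted operand changes: XOR with 0x0A when it is the LHS, 0xA0 when it is the RHS (and 0xAA when both operands are the same node, as before). Extending the illustrative model above (again a sketch under the same assumptions, not part of the commit):

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;

// Same SHUFPS model as above: lanes 0-1 from A, lanes 2-3 from B.
static V4 shufps(const V4 &A, const V4 &B, unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}

int main() {
  V4 L = {0, 1, 2, 3}, X = {4, 5, 6, 7}, Y = {8, 9, 10, 11};
  for (unsigned Inner = 0; Inner != 256; ++Inner) {
    unsigned Swapped = ((Inner & 0x0F) << 4) | ((Inner & 0xF0) >> 4);
    V4 Old = shufps(L, X, Inner);   // shufps(load(),x): load on the LHS
    V4 New = shufps(X, L, Swapped); // commuted: the load is now foldable
    for (unsigned Outer = 0; Outer != 256; ++Outer) {
      // shufps(shufps(load(),x), y): only lanes 0-1 read the commuted
      // node, so only the low nibble's 2-bit indices flip.
      assert(shufps(Old, Y, Outer) == shufps(New, Y, Outer ^ 0x0A));
      // shufps(y, shufps(load(),x)): only lanes 2-3 read it.
      assert(shufps(Y, Old, Outer) == shufps(Y, New, Outer ^ 0xA0));
    }
  }
  return 0;
}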
68 changes: 33 additions & 35 deletions llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1198,44 +1198,42 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu 64(%rdi), %xmm10
; SSE2-NEXT: movups 80(%rdi), %xmm8
; SSE2-NEXT: movups 64(%rdi), %xmm4
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu 16(%rdi), %xmm11
; SSE2-NEXT: movups 32(%rdi), %xmm5
; SSE2-NEXT: movdqu 48(%rdi), %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; SSE2-NEXT: movaps %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[2,0]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[2,0]
; SSE2-NEXT: movaps %xmm8, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,0,1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[2,0]
; SSE2-NEXT: movdqa %xmm9, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm8[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
; SSE2-NEXT: movaps %xmm2, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm10[3,3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
; SSE2-NEXT: movaps %xmm0, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm11[3,3]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0]
; SSE2-NEXT: movups %xmm9, 16(%rsi)
; SSE2-NEXT: movups %xmm3, (%rsi)
; SSE2-NEXT: movups %xmm2, 16(%rdx)
; SSE2-NEXT: movups 16(%rdi), %xmm6
; SSE2-NEXT: movups 32(%rdi), %xmm10
; SSE2-NEXT: movups 48(%rdi), %xmm12
; SSE2-NEXT: movdqa %xmm0, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm6[0,2]
; SSE2-NEXT: movaps %xmm12, %xmm6
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[0,0]
; SSE2-NEXT: movaps %xmm6, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[1,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm4[0,2]
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm8[2,0]
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,0]
; SSE2-NEXT: movups %xmm12, 16(%rsi)
; SSE2-NEXT: movups %xmm11, (%rsi)
; SSE2-NEXT: movups %xmm6, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
; SSE2-NEXT: movups %xmm1, 16(%rcx)
; SSE2-NEXT: movups %xmm6, (%rcx)
; SSE2-NEXT: movups %xmm5, 16(%rcx)
; SSE2-NEXT: movups %xmm7, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -34,18 +34,18 @@ define <4 x float> @t2(<4 x float>* %P) nounwind {
; X32-LABEL: t2:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm1
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; X32-NEXT: retl
;
; X64-LABEL: t2:
; X64: # %bb.0:
; X64-NEXT: movaps (%rdi), %xmm1
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
@@ -74,18 +74,18 @@ define <4 x float> @t4(<4 x float>* %P) nounwind {
; X32-LABEL: t4:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; X32-NEXT: retl
;
; X64-LABEL: t4:
; X64: # %bb.0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
%tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
11 changes: 5 additions & 6 deletions llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2496,16 +2496,15 @@ define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, <4 x float>* %a1) {
define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, <4 x float>* %a1) {
; SSE-LABEL: shuffle_mem_v4f32_4760:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovaps (%rdi), %xmm1
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_mem_v4f32_4760: