Skip to content

Commit

Permalink
[x86] split 256-bit vector selects if operands are vector concats
Browse files Browse the repository at this point in the history
This is similar logic/motivation to the select splitting in D62969.

In D63233, the pattern changes so that we no longer have an extract_subvector of vselect,
but the operands of the select are still being concatenated.

The most marginal case is shown in either the first or the last test diff here - we add one
extra instruction, but we convert 3-4 ymm instructions into 4-5 xmm instructions.
I think that's the right trade-off for most AVX1 targets.

In the example based on PR37428:
https://bugs.llvm.org/show_bug.cgi?id=37428
...this makes the loop about 30% faster (tested on Haswell by compiling with -mavx).

Differential Revision: https://reviews.llvm.org/D63364

llvm-svn: 363508
  • Loading branch information
rotateright committed Jun 16, 2019
1 parent fcffc2f commit d14389c
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 53 deletions.
36 changes: 36 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35542,6 +35542,39 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
return SDValue();
}

/// Replace a wide (256-bit) vector select with two half-width selects when the
/// true and false operands are both concatenations of narrower vectors:
///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
/// Returns an empty SDValue when \p N does not match this pattern.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  // Only generic vector selects and X86 variable blends are candidates.
  unsigned SelectOpc = N->getOpcode();
  bool IsSelectLike =
      SelectOpc == ISD::VSELECT || SelectOpc == X86ISD::BLENDV;
  if (!IsSelectLike)
    return SDValue();

  // TODO: Split 512-bit vectors too?
  EVT ResultVT = N->getValueType(0);
  if (!ResultVT.is256BitVector())
    return SDValue();

  // TODO: Split as long as any 2 of the 3 operands are concatenated?
  SDValue Mask = N->getOperand(0);
  SDValue TrueArm = N->getOperand(1);
  SDValue FalseArm = N->getOperand(2);

  // Both arms must be used by this select only.
  // NOTE(review): presumably this avoids keeping the wide concat alive for
  // another user - confirm.
  if (!TrueArm.hasOneUse() || !FalseArm.hasOneUse())
    return SDValue();

  // Both arms must be recognized as concatenations. The collected pieces are
  // not consumed here; only the success/failure result matters.
  SmallVector<SDValue, 4> TrueParts, FalseParts;
  if (!collectConcatOps(TrueArm.getNode(), TrueParts))
    return SDValue();
  if (!collectConcatOps(FalseArm.getNode(), FalseParts))
    return SDValue();

  // Builder callback: re-create the same select opcode on the operands it is
  // handed, taking the result type from the true-value operand (Ops[1]).
  // NOTE(review): presumably SplitOpsAndApply calls this once per narrow piece
  // and concatenates the results - confirm against its definition.
  auto BuildNarrowSelect = [SelectOpc](SelectionDAG &DAG, const SDLoc &DL,
                                       ArrayRef<SDValue> Ops) {
    return DAG.getNode(SelectOpc, DL, Ops[1].getValueType(), Ops);
  };

  SDLoc DL(N);
  return SplitOpsAndApply(DAG, Subtarget, DL, ResultVT,
                          {Mask, TrueArm, FalseArm}, BuildNarrowSelect,
                          /*CheckBWI*/ false);
}

static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
Expand Down Expand Up @@ -36105,6 +36138,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
return V;

if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
return V;

// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
LHS = DAG.getBitcast(MVT::i64, LHS);
Expand Down
59 changes: 32 additions & 27 deletions llvm/test/CodeGen/X86/cast-vsel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,16 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
; AVX1-LABEL: sext:
; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm4, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5
; AVX1-NEXT: vblendvps %xmm1, %xmm4, %xmm5, %xmm1
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext:
Expand Down Expand Up @@ -95,12 +96,13 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vblendvps %xmm5, %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext:
Expand Down Expand Up @@ -403,6 +405,8 @@ for.end:
ret void
}

; TODO: AVX1 could have used 256-bit ops for a likely improvement.

define void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE2-LABEL: example24:
; SSE2: # %bb.0: # %vector.ph
Expand Down Expand Up @@ -469,26 +473,27 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vpmovsxwd %xmm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB6_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovups da+4096(%rax), %ymm2
; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm2, %ymm2
; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm2
; AVX1-NEXT: vmovups %ymm2, dj+4096(%rax)
; AVX1-NEXT: vmovups da+4096(%rax), %ymm4
; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm4, %ymm4
; AVX1-NEXT: vblendvps %xmm4, %xmm0, %xmm1, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vblendvps %xmm4, %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vmovaps %xmm4, dj+4112(%rax)
; AVX1-NEXT: vmovaps %xmm5, dj+4096(%rax)
; AVX1-NEXT: addq $32, %rax
; AVX1-NEXT: jne .LBB6_1
; AVX1-NEXT: # %bb.2: # %for.end
Expand Down
40 changes: 19 additions & 21 deletions llvm/test/CodeGen/X86/known-signbits-vector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -310,24 +310,23 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $16, %esp
; X32-NEXT: vpmovsxdq 16(%ebp), %xmm3
; X32-NEXT: vpmovsxdq 8(%ebp), %xmm4
; X32-NEXT: vextractf128 $1, %ymm2, %xmm5
; X32-NEXT: vpsrlq $33, %xmm5, %xmm5
; X32-NEXT: vpmovsxdq 8(%ebp), %xmm3
; X32-NEXT: vpmovsxdq 16(%ebp), %xmm4
; X32-NEXT: vpsrlq $33, %xmm2, %xmm5
; X32-NEXT: vmovdqa {{.*#+}} xmm6 = [1073741824,0,1,0]
; X32-NEXT: vpxor %xmm6, %xmm5, %xmm5
; X32-NEXT: vpsubq %xmm6, %xmm5, %xmm5
; X32-NEXT: vextractf128 $1, %ymm2, %xmm2
; X32-NEXT: vpsrlq $33, %xmm2, %xmm2
; X32-NEXT: vpxor %xmm6, %xmm2, %xmm2
; X32-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; X32-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; X32-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; X32-NEXT: vextractf128 $1, %ymm1, %xmm4
; X32-NEXT: vextractf128 $1, %ymm0, %xmm5
; X32-NEXT: vpcmpeqq %xmm4, %xmm5, %xmm4
; X32-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6
; X32-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm3
; X32-NEXT: vextractf128 $1, %ymm1, %xmm1
; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; X32-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
; X32-NEXT: vblendvpd %xmm0, %xmm2, %xmm4, %xmm0
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
Expand All @@ -339,25 +338,24 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
;
; X64-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
; X64-NEXT: vpsrlq $33, %xmm4, %xmm4
; X64-NEXT: vpsrlq $33, %xmm2, %xmm4
; X64-NEXT: vmovdqa {{.*#+}} xmm5 = [1073741824,1]
; X64-NEXT: vpxor %xmm5, %xmm4, %xmm4
; X64-NEXT: vpsubq %xmm5, %xmm4, %xmm4
; X64-NEXT: vextractf128 $1, %ymm2, %xmm2
; X64-NEXT: vpsrlq $33, %xmm2, %xmm2
; X64-NEXT: vpxor %xmm5, %xmm2, %xmm2
; X64-NEXT: vpsubq %xmm5, %xmm2, %xmm2
; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; X64-NEXT: vpmovsxdq %xmm3, %xmm4
; X64-NEXT: vpmovsxdq %xmm3, %xmm5
; X64-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; X64-NEXT: vpmovsxdq %xmm3, %xmm3
; X64-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; X64-NEXT: vextractf128 $1, %ymm1, %xmm4
; X64-NEXT: vextractf128 $1, %ymm0, %xmm5
; X64-NEXT: vpcmpeqq %xmm4, %xmm5, %xmm4
; X64-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6
; X64-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm4
; X64-NEXT: vextractf128 $1, %ymm1, %xmm1
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; X64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
Expand Down
11 changes: 6 additions & 5 deletions llvm/test/CodeGen/X86/vselect-avx.ll
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ define <32 x i8> @PR22706(<32 x i1> %x) {
ret <32 x i8> %tmp
}

; TODO: Split a 256-bit select into two 128-bit selects when the operands are concatenated.
; Split a 256-bit select into two 128-bit selects when the operands are concatenated.

define void @blendv_split(<8 x i32>* %p, <8 x i32> %cond, <8 x i32> %a, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
; AVX1-LABEL: blendv_split:
Expand All @@ -177,12 +177,13 @@ define void @blendv_split(<8 x i32>* %p, <8 x i32> %cond, <8 x i32> %a, <8 x i32
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpslld %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vpslld %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; AVX1-NEXT: vpslld %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpslld %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vblendvps %xmm0, %xmm5, %xmm4, %xmm0
; AVX1-NEXT: vmovups %xmm0, 16(%rdi)
; AVX1-NEXT: vmovups %xmm1, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
Expand Down

0 comments on commit d14389c

Please sign in to comment.