Skip to content

Commit

Permalink
[DAGCombiner] convert insertelement of bitcasted vector into shuffle
Browse files Browse the repository at this point in the history
Eg:
insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}

This is a generalization of the IR fold in D38316 to handle insertion into a non-undef vector. 
We may want to abandon that IR fold if we cannot demonstrate value in matching the more specific pattern earlier in the pipeline.

We're using the existing legal shuffle target hook to avoid AVX512 horror with vXi1 shuffles.

There may be room for improvement in the shuffle lowering here, but that would be follow-up work.

Differential Revision: https://reviews.llvm.org/D38388

llvm-svn: 315460
  • Loading branch information
rotateright committed Oct 11, 2017
1 parent 3a4b7ec commit 34fd5ea
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 51 deletions.
65 changes: 62 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Expand Up @@ -415,6 +415,7 @@ namespace {
SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
SDValue CombineExtLoad(SDNode *N);
SDValue combineRepeatedFPDivisors(SDNode *N);
SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
SDValue BuildSDIV(SDNode *N);
SDValue BuildSDIVPow2(SDNode *N);
Expand Down Expand Up @@ -13747,6 +13748,60 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
return St1;
}

/// Convert a disguised subvector insertion into a shuffle:
/// insert_vector_elt V, (bitcast X from vector type), IdxC -->
/// bitcast(shuffle (bitcast V), (extended X), Mask)
/// Note: We do not use an insert_subvector node because that requires a legal
/// subvector type.
SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
  // The inserted value must be a one-use bitcast of some source vector;
  // otherwise there is no disguised subvector to recover.
  SDValue CastVal = N->getOperand(1);
  if (CastVal.getOpcode() != ISD::BITCAST || !CastVal.hasOneUse() ||
      !CastVal.getOperand(0).getValueType().isVector())
    return SDValue();

  SDValue Src = CastVal.getOperand(0);
  SDValue Dst = N->getOperand(0);
  EVT SrcVT = Src.getValueType();
  EVT VT = Dst.getValueType();
  unsigned NumSrcElts = SrcVT.getVectorNumElements();
  // How many copies of the source vector fit in the destination vector.
  unsigned Ratio = VT.getSizeInBits() / SrcVT.getSizeInBits();
  unsigned NumMaskElts = Ratio * NumSrcElts;

  // Step 1: Build a shuffle mask that implements this insert operation. The
  // destination vector is operand 0 of the shuffle, so its lanes simply
  // select themselves ('i'). The inserted source vector occupies the first
  // lanes of operand 1. Example:
  // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
  SmallVector<int, 16> Mask;
  Mask.reserve(NumMaskElts);
  for (unsigned i = 0; i != NumMaskElts; ++i)
    Mask.push_back(i / NumSrcElts == InsIndex ? NumMaskElts + i % NumSrcElts
                                              : i);

  // Bail out if the target can not handle the shuffle we want to create.
  EVT SrcEltVT = SrcVT.getVectorElementType();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumMaskElts);
  if (!TLI.isShuffleMaskLegal(Mask, ShuffleVT))
    return SDValue();

  // Step 2: Widen the inserted source vector to the destination size by
  // concatenating undef vectors after it.
  SDLoc DL(N);
  SmallVector<SDValue, 8> ConcatOps(Ratio, DAG.getUNDEF(SrcVT));
  ConcatOps[0] = Src;
  SDValue WideSrc = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleVT, ConcatOps);

  // Step 3: Shuffle in the padded subvector.
  SDValue DstBC = DAG.getBitcast(ShuffleVT, Dst);
  SDValue Shuf = DAG.getVectorShuffle(ShuffleVT, DL, DstBC, WideSrc, Mask);
  AddToWorklist(WideSrc.getNode());
  AddToWorklist(DstBC.getNode());
  AddToWorklist(Shuf.getNode());
  return DAG.getBitcast(VT, Shuf);
}

SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
Expand All @@ -13765,10 +13820,14 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
return InVec;

// Check that we know which element is being inserted
if (!isa<ConstantSDNode>(EltNo))
// We must know which element is being inserted for folds below here.
auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
if (!IndexC)
return SDValue();
unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
unsigned Elt = IndexC->getZExtValue();

if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
return Shuf;

// Canonicalize insert_vector_elt dag nodes.
// Example:
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
Expand Up @@ -140,7 +140,7 @@ define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {

define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
; CHECK-LABEL: ins1f2:
; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = extractelement <1 x double> %tmp1, i32 0
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
ret <2 x double> %tmp4
Expand Down
61 changes: 14 additions & 47 deletions llvm/test/CodeGen/X86/insertelement-shuffle.ll
Expand Up @@ -7,42 +7,34 @@
define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounwind {
; X32_AVX256-LABEL: insert_subvector_256:
; X32_AVX256: # BB#0:
; X32_AVX256-NEXT: pushl %eax
; X32_AVX256-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32_AVX256-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32_AVX256-NEXT: vmovd %xmm1, (%esp)
; X32_AVX256-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
; X32_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X32_AVX256-NEXT: popl %eax
; X32_AVX256-NEXT: vpbroadcastd %xmm1, %xmm1
; X32_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; X32_AVX256-NEXT: retl
;
; X64_AVX256-LABEL: insert_subvector_256:
; X64_AVX256: # BB#0:
; X64_AVX256-NEXT: vmovd %edi, %xmm1
; X64_AVX256-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
; X64_AVX256-NEXT: vmovd %xmm1, -{{[0-9]+}}(%rsp)
; X64_AVX256-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
; X64_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64_AVX256-NEXT: vpbroadcastd %xmm1, %xmm1
; X64_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; X64_AVX256-NEXT: retq
;
; X32_AVX512-LABEL: insert_subvector_256:
; X32_AVX512: # BB#0:
; X32_AVX512-NEXT: pushl %eax
; X32_AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32_AVX512-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32_AVX512-NEXT: vmovd %xmm1, (%esp)
; X32_AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
; X32_AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X32_AVX512-NEXT: popl %eax
; X32_AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
; X32_AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; X32_AVX512-NEXT: retl
;
; X64_AVX512-LABEL: insert_subvector_256:
; X64_AVX512: # BB#0:
; X64_AVX512-NEXT: vmovd %edi, %xmm1
; X64_AVX512-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
; X64_AVX512-NEXT: vmovd %xmm1, -{{[0-9]+}}(%rsp)
; X64_AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
; X64_AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64_AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
; X64_AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; X64_AVX512-NEXT: retq
%ins1 = insertelement <2 x i16> undef, i16 %x0, i32 0
%ins2 = insertelement <2 x i16> %ins1, i16 %x1, i32 1
Expand Down Expand Up @@ -80,28 +72,17 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind
;
; X32_AVX512-LABEL: insert_subvector_512:
; X32_AVX512: # BB#0:
; X32_AVX512-NEXT: pushl %ebp
; X32_AVX512-NEXT: movl %esp, %ebp
; X32_AVX512-NEXT: andl $-8, %esp
; X32_AVX512-NEXT: subl $8, %esp
; X32_AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X32_AVX512-NEXT: vmovlps %xmm1, (%esp)
; X32_AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X32_AVX512-NEXT: vpinsrd $0, (%esp), %xmm1, %xmm1
; X32_AVX512-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32_AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
; X32_AVX512-NEXT: movl %ebp, %esp
; X32_AVX512-NEXT: popl %ebp
; X32_AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X32_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,8,0,3,0,4,0,5,0,6,0,7,0]
; X32_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; X32_AVX512-NEXT: retl
;
; X64_AVX512-LABEL: insert_subvector_512:
; X64_AVX512: # BB#0:
; X64_AVX512-NEXT: vmovd %edi, %xmm1
; X64_AVX512-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
; X64_AVX512-NEXT: vmovq %xmm1, %rax
; X64_AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64_AVX512-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
; X64_AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
; X64_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7]
; X64_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; X64_AVX512-NEXT: retq
%ins1 = insertelement <2 x i32> undef, i32 %x0, i32 0
%ins2 = insertelement <2 x i32> %ins1, i32 %x1, i32 1
Expand Down Expand Up @@ -144,22 +125,8 @@ define <8 x i64> @insert_subvector_into_undef(i32 %x0, i32 %x1) nounwind {
;
; X32_AVX512-LABEL: insert_subvector_into_undef:
; X32_AVX512: # BB#0:
; X32_AVX512-NEXT: pushl %ebp
; X32_AVX512-NEXT: movl %esp, %ebp
; X32_AVX512-NEXT: andl $-8, %esp
; X32_AVX512-NEXT: subl $8, %esp
; X32_AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32_AVX512-NEXT: vmovlps %xmm0, (%esp)
; X32_AVX512-NEXT: movl (%esp), %eax
; X32_AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32_AVX512-NEXT: vmovd %eax, %xmm0
; X32_AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; X32_AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; X32_AVX512-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
; X32_AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32_AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X32_AVX512-NEXT: movl %ebp, %esp
; X32_AVX512-NEXT: popl %ebp
; X32_AVX512-NEXT: vbroadcastsd %xmm0, %zmm0
; X32_AVX512-NEXT: retl
;
; X64_AVX512-LABEL: insert_subvector_into_undef:
Expand Down

0 comments on commit 34fd5ea

Please sign in to comment.