Skip to content

Commit

Permalink
[DagCombine] Improve DAGCombiner BUILD_VECTOR when it has two sources…
Browse files Browse the repository at this point in the history
… of elements

This partially fixes PR21943.

For AVX, we go from:

vmovq   (%rsi), %xmm0
vmovq   (%rdi), %xmm1
vpermilps       $-27, %xmm1, %xmm2 ## xmm2 = xmm1[1,1,2,3]
vinsertps       $16, %xmm2, %xmm1, %xmm1 ## xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
vinsertps       $32, %xmm0, %xmm1, %xmm1 ## xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
vpermilps       $-27, %xmm0, %xmm0 ## xmm0 = xmm0[1,1,2,3]
vinsertps       $48, %xmm0, %xmm1, %xmm0 ## xmm0 = xmm1[0,1,2],xmm0[0]

To the expected:

vmovq   (%rdi), %xmm0
vmovhpd (%rsi), %xmm0, %xmm0
retq

Fixing this for AVX2 is still open.

Differential Revision: http://reviews.llvm.org/D6749

llvm-svn: 224759
  • Loading branch information
Michael Kuperstein committed Dec 23, 2014
1 parent 04b16b5 commit f4536ea
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 12 deletions.
34 changes: 22 additions & 12 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Expand Up @@ -10832,6 +10832,7 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {

// If everything is good, we can make a shuffle operation.
if (VecIn1.getNode()) {
unsigned InNumElements = VecIn1.getValueType().getVectorNumElements();
SmallVector<int, 8> Mask;
for (unsigned i = 0; i != NumInScalars; ++i) {
unsigned Opcode = N->getOperand(i).getOpcode();
Expand All @@ -10858,8 +10859,8 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
continue;
}

// Otherwise, use InIdx + VecSize
Mask.push_back(NumInScalars+ExtIndex);
// Otherwise, use InIdx + InputVecSize
Mask.push_back(InNumElements + ExtIndex);
}

// Avoid introducing illegal shuffles with zero.
Expand All @@ -10869,26 +10870,35 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
// We can't generate a shuffle node with mismatched input and output types.
// Attempt to transform a single input vector to the correct type.
if ((VT != VecIn1.getValueType())) {
// We don't support shuffling between TWO values of different types.
if (VecIn2.getNode())
return SDValue();

// If the input vector type has a different base type to the output
// vector type, bail out.
if (VecIn1.getValueType().getVectorElementType() !=
VT.getVectorElementType())
EVT VTElemType = VT.getVectorElementType();
if ((VecIn1.getValueType().getVectorElementType() != VTElemType) ||
(VecIn2.getNode() &&
(VecIn2.getValueType().getVectorElementType() != VTElemType)))
return SDValue();

// If the input vector is too small, widen it.
// We only support widening of vectors which are half the size of the
// output registers. For example XMM->YMM widening on X86 with AVX.
EVT VecInT = VecIn1.getValueType();
if (VecInT.getSizeInBits() * 2 == VT.getSizeInBits()) {
// Widen the input vector by adding undef values.
VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
// If we only have one small input, widen it by adding undef values.
if (!VecIn2.getNode())
VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1,
DAG.getUNDEF(VecIn1.getValueType()));
else if (VecIn1.getValueType() == VecIn2.getValueType()) {
// If we have two small inputs of the same type, try to concat them.
VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1, VecIn2);
VecIn2 = SDValue(nullptr, 0);
} else
return SDValue();
} else if (VecInT.getSizeInBits() == VT.getSizeInBits() * 2) {
// If the input vector is too large, try to split it.
// We don't support having two input vectors that are too large.
if (VecIn2.getNode())
return SDValue();

if (!TLI.isExtractSubvectorCheap(VT, VT.getVectorNumElements()))
return SDValue();

Expand All @@ -10899,7 +10909,7 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
DAG.getConstant(0, TLI.getVectorIdxTy()));
UsesZeroVector = false;
} else
} else
return SDValue();
}

Expand Down
20 changes: 20 additions & 0 deletions llvm/test/CodeGen/X86/vector-shuffle-combining.ll
Expand Up @@ -1584,6 +1584,26 @@ define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
ret <4 x i32> %2
}

; Regression test for the BUILD_VECTOR combine when the elements are drawn from
; two source vectors (partial fix for PR21943). Two <2 x float> values are
; loaded and shuffled into the low four lanes of an <8 x float>; the upper four
; lanes are undef. The checks below expect the whole sequence to collapse to a
; movq/movhpd pair (one load per input pointer) followed by the return, rather
; than a chain of per-element permute and insert instructions.
define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
; SSE-LABEL: combine_test22:
; SSE: # BB#0:
; SSE-NEXT: movq (%rdi), %xmm0
; SSE-NEXT: movhpd (%rsi), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_test22:
; AVX1: # BB#0:
; AVX1-NEXT: vmovq (%rdi), %xmm0
; AVX1-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; Current AVX2 lowering of this is still awful, not adding a test case.
%1 = load <2 x float>* %a, align 8
%2 = load <2 x float>* %b, align 8
; Mask indices 0-1 select both elements of %1 and indices 2-3 select both
; elements of %2; the remaining four result lanes are left undefined.
%3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x float> %3
}

; Check some negative cases.
; FIXME: Do any of these really make sense? Are they redundant with the above tests?

Expand Down

0 comments on commit f4536ea

Please sign in to comment.