Skip to content

Commit

Permalink
Optimization for certain shufflevector by using insertps.
Browse files Browse the repository at this point in the history
Summary:
If we're doing a v4f32/v4i32 shuffle on x86 with SSE4.1, we can lower
certain shufflevectors to an insertps instruction: namely, when all but one
of the shufflevector result's elements come from one vector (and keep their
index), and the remaining element comes from another vector or a memory
operand.

Added tests for insertps optimizations on shufflevector.
Added support and tests for v4i32 vector optimization.

Reviewers: nadav

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D3475

llvm-svn: 207291
  • Loading branch information
filcab committed Apr 25, 2014
1 parent 42292ce commit 363b570
Show file tree
Hide file tree
Showing 2 changed files with 177 additions and 2 deletions.
104 changes: 104 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -3931,6 +3931,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
return true;
}

/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to INSERTPS,
/// i.e. exactly three result elements come from one input vector and keep
/// their index, and the remaining element is a defined element of the other
/// input vector.
///
/// \param Mask shuffle mask; values 0-3 select from the first input, values
///             4-7 select from the second input, negative values are undef.
/// \param VT   type of the shuffle result; only v4f32 and v4i32 are accepted.
static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
  // TODO: Deal with AVX's VINSERTPS
  if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
    return false;

  unsigned CorrectPosV1 = 0; // Elements taken from V1 at their own index.
  unsigned CorrectPosV2 = 0; // Elements taken from V2 at their own index.
  unsigned FromV1 = 0;       // Elements taken from V1 at any index.
  unsigned FromV2 = 0;       // Elements taken from V2 at any index.
  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
    const int Elt = Mask[i];

    // An undef element gives getINSERTPS no well-defined lane to work with
    // (it would either miscount the element as coming from V1 or fail to
    // find the inserted lane at all), so reject such masks here.
    if (Elt < 0)
      return false;

    if (Elt < 4) {
      ++FromV1;
      if (Elt == i)
        ++CorrectPosV1;
    } else {
      ++FromV2;
      if (Elt == i + 4)
        ++CorrectPosV2;
    }
  }

  // We need 3 in-place elements from one vector, and the fourth element must
  // really come from the *other* vector. Masks such as <0, 1, 2, 0> have
  // three in-place elements but nothing to insert from the second input.
  return (CorrectPosV1 == 3 && FromV2 == 1) ||
         (CorrectPosV2 == 3 && FromV1 == 1);
}

//
// Some special combinations that can be optimized.
//
Expand Down Expand Up @@ -7263,6 +7286,84 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
getShuffleSHUFImmediate(SVOp), DAG);
}

// Lower a v4f32/v4i32 shuffle that takes three in-place elements from one
// input and one element from the other input to an INSERTPS node (or, for an
// i32 coming from a foldable load, to an INSERT_VECTOR_ELT node).
//
// It is only safe to call this function if isINSERTPSMask is true for
// this shufflevector mask.
//
// SVOp is the shuffle being lowered, dl the debug location used for the new
// nodes and DAG the selection DAG they are created in. Returns the
// replacement node.
static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
                           SelectionDAG &DAG) {
  // Generate an insertps instruction when inserting an f32 from memory onto a
  // v4f32 or when copying a member from one v4f32 to another.
  // We also use it for transferring i32 from one register to another,
  // since it simply copies the same bits.
  // If we're transferring an i32 from memory to a specific element in a
  // register, we output a generic DAG that will match the PINSRD
  // instruction.
  // TODO: Optimize for AVX cases too (VINSERTPS)
  MVT VT = SVOp->getSimpleValueType(0);
  MVT EVT = VT.getVectorElementType();
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  auto Mask = SVOp->getMask();
  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
         "unsupported vector type for insertps/pinsrd");

  // Negative mask values are undef and belong to neither input, so they must
  // not be counted as V1 elements.
  auto IsFromV1 = [](const int &i) { return i >= 0 && i < 4; };
  auto IsFromV2 = [](const int &i) { return i >= 4; };

  int FromV1 = std::count_if(Mask.begin(), Mask.end(), IsFromV1);

  // From is the vector supplying the single inserted element, To is the
  // vector supplying the other three; DestIndex is the result lane written.
  SDValue From;
  SDValue To;
  unsigned DestIndex;
  if (FromV1 == 1) {
    From = V1;
    To = V2;
    DestIndex = std::find_if(Mask.begin(), Mask.end(), IsFromV1) - Mask.begin();
  } else {
    From = V2;
    To = V1;
    DestIndex = std::find_if(Mask.begin(), Mask.end(), IsFromV2) - Mask.begin();
  }
  assert(DestIndex < Mask.size() &&
         "no element to insert; isINSERTPSMask should have rejected this mask");

  // Lane of From that is being inserted.
  unsigned SrcIndex = Mask[DestIndex] % 4;

  if (MayFoldLoad(From)) {
    // Trivial case, when From comes from a load and is only used by the
    // shuffle. Make it use insertps from the vector that we need from that
    // load.
    // The scalar we need lives at the *source* lane of the loaded vector, so
    // the narrowed load must be offset by SrcIndex, not by DestIndex.
    unsigned Offset = SrcIndex * EVT.getStoreSize();
    SDValue Addr = From.getOperand(1);
    SDValue NewAddr =
        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
                    DAG.getConstant(Offset, Addr.getSimpleValueType()));

    // NOTE(review): users of the original load's chain result are not
    // redirected to the new load here -- confirm this is acceptable.
    LoadSDNode *Load = cast<LoadSDNode>(From);
    SDValue NewLoad =
        DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Load->getMemOperand(), Offset, EVT.getStoreSize()));

    if (EVT == MVT::f32) {
      // Create this as a scalar to vector to match the instruction pattern.
      SDValue LoadScalarToVector =
          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
      // The scalar sits in lane 0 of the new vector, so only the destination
      // lane is encoded in the immediate.
      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
                         InsertpsMask);
    } else { // EVT == MVT::i32
      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
      // instruction, to match the PINSRD instruction, which loads an i32 to a
      // certain vector element.
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
                         DAG.getConstant(DestIndex, MVT::i32));
    }
  }

  // Vector-element-to-vector: destination lane is encoded starting at bit 4
  // of the immediate, source lane starting at bit 6.
  SDValue InsertpsMask =
      DAG.getIntPtrConstant((DestIndex << 4) | (SrcIndex << 6));
  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
}

// Reduce a vector shuffle to zext.
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
Expand Down Expand Up @@ -7674,6 +7775,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (BlendOp.getNode())
return BlendOp;

if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
return getINSERTPS(SVOp, dl, DAG);

unsigned Imm8;
if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
Expand Down
75 changes: 73 additions & 2 deletions llvm/test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK

@g16 = external global i16

Expand Down Expand Up @@ -249,3 +249,74 @@ entry:
; X64: ret
}

; Shuffle mask <0, 1, 2, 4>: lanes 0-2 of %a are kept and lane 0 of the
; <4 x float> loaded from %pb is placed in lane 3. The checks below expect an
; insertps with immediate 48 and no shufps.
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
entry:
%0 = load <4 x float>* %pb, align 16
%vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x float> %vecinit6
; CHECK-LABEL: insertps_from_shufflevector_1:
; CHECK-NOT: shufps
; CHECK: insertps $48,
; CHECK: ret
}

; Shuffle mask <0, 1, 5, 3>: lane 1 of %b is placed in lane 2 of %a, with both
; inputs passed as vector arguments. The checks below expect a single insertps
; with immediate 96 and neither a mov nor a shufps.
define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
entry:
%vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
ret <4 x float> %vecinit6
; CHECK-LABEL: insertps_from_shufflevector_2:
; CHECK-NOT: mov
; CHECK-NOT: shufps
; CHECK: insertps $96,
; CHECK: ret
}

; For loading an i32 from memory into an xmm register we use pinsrd
; instead of insertps
; Shuffle mask <0, 1, 2, 4>: lane 0 of the <4 x i32> loaded from %pb is placed
; in lane 3 of %a. The checks below expect pinsrd with immediate 3 and neither
; a mov nor a shufps.
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
entry:
%0 = load <4 x i32>* %pb, align 16
%vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %vecinit6
; CHECK-LABEL: pinsrd_from_shufflevector_i32:
; CHECK-NOT: mov
; CHECK-NOT: shufps
; CHECK: pinsrd $3,
; CHECK: ret
}

; Integer variant with both inputs passed as vector arguments. Shuffle mask
; <0, 7, 2, 3>: lane 3 of %b is placed in lane 1 of %a. The checks below expect
; insertps with immediate 208 and neither a mov nor a shufps.
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
entry:
%vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
ret <4 x i32> %vecinit6
; CHECK-LABEL: insertps_from_shufflevector_i32_2:
; CHECK-NOT: mov
; CHECK-NOT: shufps
; CHECK: insertps $208,
; CHECK: ret
}

; A float loaded from %b is inserted into lane 0 of an undef vector, and the
; shuffle mask <0, 4, 2, 3> then places that lane in lane 1 of %a. The checks
; below expect insertps with immediate 16 and neither a mov nor a shufps.
define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; CHECK-LABEL: insertps_from_load_ins_elt_undef:
; CHECK-NOT: mov
; CHECK-NOT: shufps
; CHECK: insertps $16,
; CHECK: ret
%1 = load float* %b, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
ret <4 x float> %result
}

; Integer variant: an i32 loaded from %b is inserted into lane 0 of an undef
; vector, and the shuffle mask <0, 1, 4, 3> then places that lane in lane 2 of
; %a. The checks below expect insertps with immediate 32 and no shufps; the
; check for the absence of a mov is intentionally disabled (see the TODO).
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
;; aCHECK-NOT: mov
; CHECK-NOT: shufps
; CHECK: insertps $32,
; CHECK: ret
%1 = load i32* %b, align 4
%2 = insertelement <4 x i32> undef, i32 %1, i32 0
%result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
ret <4 x i32> %result
}

0 comments on commit 363b570

Please sign in to comment.