
Commit c88f724

[X86] Prefer blendps over insertps codegen for one special case
With this patch, for this one exact case, we'll generate:

  blendps %xmm0, %xmm1, $1

instead of:

  insertps %xmm0, %xmm1, $0

If there's a memory operand available for load folding and we're optimizing
for size, we'll still generate the insertps.

The detailed performance data motivation for this may be found in D7866; in
summary, blendps has 2-3x throughput vs. insertps on widely used chips.

Differential Revision: http://reviews.llvm.org/D8332

llvm-svn: 232850
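The affected pattern is a single insertelement into lane 0 of a <4 x float>. A minimal sketch of such a function follows; the function name is made up for illustration and is not taken from the commit or its tests, and the target is assumed to be x86-64 with SSE4.1:

; Hypothetical example (not from the commit); assumes an x86-64 target with SSE4.1.
; Per the commit message, this lane-0 insertion now selects blendps (immediate 1)
; rather than insertps (immediate 0) when not optimizing for size.
define <4 x float> @insert_f32_lane0(<4 x float> %v, float %s) nounwind {
  %r = insertelement <4 x float> %v, float %s, i32 0
  ret <4 x float> %r
}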
1 parent 03ad616 commit c88f724

2 files changed: +54 additions, -18 deletions


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 22 additions & 9 deletions
@@ -10550,16 +10550,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   }
 
   if (EltVT == MVT::f32) {
-    // Bits [7:6] of the constant are the source select. This will always be
-    //  zero here. The DAG Combiner may combine an extract_elt index into
-    //  these
-    //  bits. For example (insert (extract, 3), 2) could be matched by
-    //  putting
-    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
-    // Bits [5:4] of the constant are the destination select. This is the
-    //  value of the incoming immediate.
-    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+    // Bits [7:6] of the constant are the source select. This will always be
+    //   zero here. The DAG Combiner may combine an extract_elt index into
+    //   these bits. For example (insert (extract, 3), 2) could be matched by
+    //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+    // Bits [5:4] of the constant are the destination select. This is the
+    //   value of the incoming immediate.
+    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
     //   combine either bitwise AND or insert of float 0.0 to set these bits.
+
+    const Function *F = DAG.getMachineFunction().getFunction();
+    bool MinSize = F->hasFnAttribute(Attribute::MinSize);
+    if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+      // If this is an insertion of 32-bits into the low 32-bits of
+      // a vector, we prefer to generate a blend with immediate rather
+      // than an insertps. Blends are simpler operations in hardware and so
+      // will always have equal or better performance than insertps.
+      // But if optimizing for size and there's a load folding opportunity,
+      // generate insertps because blendps does not have a 32-bit memory
+      // operand form.
+      N2 = DAG.getIntPtrConstant(1);
+      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+      return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+    }
     N2 = DAG.getIntPtrConstant(IdxVal << 4);
     // Create this as a scalar to vector..
     N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);

llvm/test/CodeGen/X86/sse41.ll

Lines changed: 32 additions & 9 deletions
@@ -199,28 +199,51 @@ define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
 
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
-define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
-; X32-LABEL: insertps_2:
+; When optimizing for speed, prefer blendps over insertps even if it means we have to
+; generate a separate movss to load the scalar operand.
+define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
+; X32-LABEL: blendps_not_insertps_1:
+; X32:       ## BB#0:
+; X32-NEXT:    movss {{.*#+}} xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: blendps_not_insertps_1:
+; X64:       ## BB#0:
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
+  ret <4 x float> %tmp1
+}
+
+; When optimizing for size, generate an insertps if there's a load fold opportunity.
+; The difference between i386 and x86-64 ABIs for the float operand means we should
+; generate an insertps for X32 but not for X64!
+define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
+; X32-LABEL: insertps_or_blendps:
 ; X32:       ## BB#0:
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: insertps_2:
+; X64-LABEL: insertps_or_blendps:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
   ret <4 x float> %tmp1
 }
-define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
-; X32-LABEL: insertps_3:
+
+; An insert into the low 32-bits of a vector from the low 32-bits of another vector
+; is always just a blendps because blendps is never more expensive than insertps.
+define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
+; X32-LABEL: blendps_not_insertps_2:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: insertps_3:
+; X64-LABEL: blendps_not_insertps_2:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %tmp2 = extractelement <4 x float> %t2, i32 0
   %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
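These FileCheck patterns are exercised by running llc on the test file and piping the output into FileCheck. The test's actual RUN lines are not part of this diff, so the invocation below is only an assumed sketch; the real triple, CPU, and flags in sse41.ll may differ:

; Assumed invocation, not copied from sse41.ll; an SSE4.1-capable Darwin-style
; triple matches the "##" comment syntax in the checks above.
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck %s --check-prefix=X64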
