
Commit dc7387b

[LV] Adjust cost model to use uniform store lowering for unpredicated uniform stores

Follow-up to D133580: adjust the cost model to prefer the uniform store lowering for scalable stores that are unpredicated.

The impact here isn't in the uniform store lowering quality itself. InstCombine happily converts the scatter form into the single store form. The main impact is in letting the rest of the cost model make choices based on the knowledge that the vector will be scalarized on use.
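As a rough sketch of the difference (value names and the <vscale x 4 x i16> element type are borrowed from the AArch64 test below, not mandated by the change), the two lowerings of an unpredicated uniform store of %v to an invariant address %dst are:

; Scatter form: splat the invariant pointer, then scatter every lane to it.
call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %v, <vscale x 4 x i16*> %dst.splat, i32 2, <vscale x 4 x i1> %all.ones)

; Uniform store form: compute the index of the last lane from vscale,
; extract that lane, and emit a single scalar store.
%vs = call i32 @llvm.vscale.i32()
%lanes = mul i32 %vs, 4
%last.idx = sub i32 %lanes, 1
%last = extractelement <vscale x 4 x i16> %v, i32 %last.idx
store i16 %last, i16* %dst, align 2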

Differential Revision: https://reviews.llvm.org/D134460
preames committed Sep 27, 2022
1 parent ef89409 commit dc7387b
Showing 3 changed files with 39 additions and 34 deletions.
12 changes: 9 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6823,6 +6823,13 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
         // Scalarization of fixed length vectors "just works".
         return true;
 
+      // We have dedicated lowering for unpredicated uniform loads and
+      // stores. Note that even with tail folding we know that at least
+      // one lane is active (i.e. generalized predication is not possible
+      // here), and the logic below depends on this fact.
+      if (!foldTailByMasking())
+        return true;
+
       // For scalable vectors, a uniform memop load is always
       // uniform-by-parts and we know how to scalarize that.
       if (isa<LoadInst>(I))
@@ -6840,12 +6847,11 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
 
     // Load: Scalar load + broadcast
    // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
-    // TODO: Avoid replicating loads and stores instead of relying on
-    // instcombine to remove them.
+    // FIXME: This cost is a significant under-estimate for tail folded
+    // memory ops.
     const InstructionCost ScalarizationCost = isLegalToScalarize() ?
       getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
 
-
     // Choose better solution for the current VF, Note that Invalid
     // costs compare as maximumal large. If both are invalid, we get
     // scalable invalid which signals a failure and a vectorization abort.
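A note on the new foldTailByMasking() bail-out above: once the tail is folded by masking, the store is predicated and the last lane of the vector may be inactive, so extracting the last lane is no longer a correct lowering. A hedged illustration (the %mask and operand names are hypothetical) of the predicated store this guard leaves in scatter form:

; Tail folding: %mask disables lanes past the trip count, so the store
; cannot blindly extract the last lane and remains a masked scatter.
call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %v, <vscale x 4 x i16*> %dst.splat, i32 2, <vscale x 4 x i1> %mask)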
18 changes: 10 additions & 8 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -15,8 +15,6 @@ define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16*> poison, i16* [[DST:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -25,12 +23,16 @@ define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <vscale x 4 x i16>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP7]], align 2
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16*> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 [[TMP9]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP10]]
+; CHECK-NEXT:    store i16 [[TMP11]], i16* [[DST:%.*]], align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_INC24:%.*]], label [[SCALAR_PH]]
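For orientation, here is a hypothetical reconstruction of the scalar loop that inv_store_i16 vectorizes (the signature comes from the test; the body is inferred from the CHECK lines). Every iteration stores src[i] to the same loop-invariant address %dst, which is what makes the vector store uniform:

define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %src.gep = getelementptr inbounds i16, i16* %src, i64 %i
  %v = load i16, i16* %src.gep, align 2
  store i16 %v, i16* %dst, align 2                   ; invariant address
  %i.next = add nuw nsw i64 %i, 1
  %done = icmp eq i64 %i.next, %N
  br i1 %done, label %exit, label %for.body

exit:
  ret void
}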
43 changes: 20 additions & 23 deletions llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -934,32 +934,29 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; SCALABLE-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <vscale x 1 x i64> [[TMP2]], zeroinitializer
-; SCALABLE-NEXT:    [[TMP4:%.*]] = mul <vscale x 1 x i64> [[TMP3]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP4]]
-; SCALABLE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP5]]
-; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i32 0
-; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[B:%.*]], i32 0
-; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
-; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
-; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[VEC_IND]], <vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
-; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
-; SCALABLE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; SCALABLE-NEXT:    store <vscale x 1 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP9]], align 8
-; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; SCALABLE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[INDEX]], i32 0
+; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[TMP3:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP2]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = mul <vscale x 1 x i64> [[TMP3]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP5:%.*]] = add <vscale x 1 x i64> [[DOTSPLAT]], [[TMP4]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALABLE-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 1 x i64> [[TMP5]], i32 [[TMP8]]
+; SCALABLE-NEXT:    store i64 [[TMP9]], ptr [[B:%.*]], align 8
+; SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; SCALABLE-NEXT:    store <vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP11]], align 8
+; SCALABLE-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
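Reading the new SCALABLE sequence above: the value stored to %b is the last lane of splat(%index) + stepvector, i.e. %index + (vscale - 1) for a <vscale x 1 x i64> vector. A sketch (names are illustrative) of the scalar form InstCombine could reduce that sequence to:

%vs = call i64 @llvm.vscale.i64()      ; lane count of <vscale x 1 x i64>
%last.idx = sub i64 %vs, 1             ; index of the last lane
%val = add i64 %index, %last.idx       ; last lane of splat(%index) + stepvector
store i64 %val, ptr %b, align 8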
