Skip to content

Commit

Permalink
[ARM] Workaround tailpredication min/max costmodel
Browse files Browse the repository at this point in the history
The min/max intrinsics are not yet canonical, but when they are, the
tail-predication analysis will change from treating them like icmp to
treating them like intrinsics. Unfortunately, they can currently produce
better code by not being tail predicated, thanks to the vectorizer picking
higher VFs and the backend folding to better instructions (especially
for saturate patterns). In the long run we will need to improve the
vectorizer's cost modelling, recognizing the instruction directly, but in
the meantime this treats min/max as before to prevent performance
regressions.
  • Loading branch information
davemgreen committed Aug 30, 2021
1 parent 83df940 commit efa340f
Show file tree
Hide file tree
Showing 2 changed files with 241 additions and 0 deletions.
14 changes: 14 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
Expand Up @@ -1960,6 +1960,20 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
// we simply count the icmps, i.e. there should only be 1 for the backedge.
if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
return false;
// FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
// not currently canonical, but soon will be. Code without them uses icmp, and
// so is not tail predicated as per the condition above. In order to get the
// same performance we treat min and max the same as an icmp for tailpred
// purposes for the moment (we often rely on non-tailpred and higher VF's to
// pick more optimal instructions like VQDMULH. They need to be recognized
// directly by the vectorizer).
if (auto *II = dyn_cast<IntrinsicInst>(&I))
if ((II->getIntrinsicID() == Intrinsic::smin ||
II->getIntrinsicID() == Intrinsic::smax ||
II->getIntrinsicID() == Intrinsic::umin ||
II->getIntrinsicID() == Intrinsic::umax) &&
++ICmpCount > 1)
return false;

if (isa<FCmpInst>(&I))
return false;
Expand Down
227 changes: 227 additions & 0 deletions llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -0,0 +1,227 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s

; This test, after a lot of cleanup, should pick a tail-predicated 8x
; vector loop. The 8x will be more profitable, as it allows a VQDMULH.s16
; instruction to be picked.
; FIXME: Tailpredicate too, but not at the expense of 8x vectorized.

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-none-eabi"

; Scalar Q15 multiply kernel in unoptimized (alloca-based) form. For each of
; the %blockSize elements it loads one i16 from %pSrcA and one from %pSrcB,
; sign-extends both to i32, multiplies them, shifts the product right
; arithmetically by 15, passes the result through @__SSAT(..., 16), truncates
; to i16 and stores it through %pDst. All three pointers advance by one
; element per iteration.
; The autogenerated assertions that follow expect an <8 x i16> vector loop
; plus a scalar remainder loop after the O3 pipeline has run.
define void @arm_mult_q15(i16* %pSrcA, i16* %pSrcB, i16 * noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @arm_mult_q15(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP_NOT3:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0
; CHECK-NEXT: br i1 [[CMP_NOT3]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
; CHECK: while.body.preheader:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER17:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8
; CHECK-NEXT: [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i16, i16* [[PSRCA:%.*]], i32 [[N_VEC]]
; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i32 [[N_VEC]]
; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i16, i16* [[PSRCB:%.*]], i32 [[N_VEC]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRCA]], i32 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i16, i16* [[PDST]], i32 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i16, i16* [[PSRCB]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[NEXT_GEP15]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i16>, <8 x i16>* [[TMP2]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD16]] to <8 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[TMP5]], <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP5]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[TMP7]] to <8 x i16>
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[NEXT_GEP14]] to <8 x i16>*
; CHECK-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* [[TMP9]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER17]]
; CHECK: while.body.preheader17:
; CHECK-NEXT: [[BLKCNT_07_PH:%.*]] = phi i32 [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[PSRCA_ADDR_06_PH:%.*]] = phi i16* [ [[PSRCA]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[PDST_ADDR_05_PH:%.*]] = phi i16* [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END11]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[PSRCB_ADDR_04_PH:%.*]] = phi i16* [ [[PSRCB]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END13]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK: while.body:
; CHECK-NEXT: [[BLKCNT_07:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_07_PH]], [[WHILE_BODY_PREHEADER17]] ]
; CHECK-NEXT: [[PSRCA_ADDR_06:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_06_PH]], [[WHILE_BODY_PREHEADER17]] ]
; CHECK-NEXT: [[PDST_ADDR_05:%.*]] = phi i16* [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_05_PH]], [[WHILE_BODY_PREHEADER17]] ]
; CHECK-NEXT: [[PSRCB_ADDR_04:%.*]] = phi i16* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_04_PH]], [[WHILE_BODY_PREHEADER17]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_06]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[PSRCA_ADDR_06]], align 2
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP11]] to i32
; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_04]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[PSRCB_ADDR_04]], align 2
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP12]] to i32
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], 15
; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[SHR]], 32767
; CHECK-NEXT: [[RETVAL_1_I:%.*]] = select i1 [[TMP13]], i32 [[SHR]], i32 32767
; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[RETVAL_1_I]] to i16
; CHECK-NEXT: [[INCDEC_PTR4]] = getelementptr inbounds i16, i16* [[PDST_ADDR_05]], i32 1
; CHECK-NEXT: store i16 [[CONV3]], i16* [[PDST_ADDR_05]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_07]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: while.end:
; CHECK-NEXT: ret void
;
; Spill every argument to its own stack slot and seed the loop counter
; %blkCnt with %blockSize.
entry:
%pSrcA.addr = alloca i16*, align 4
%pSrcB.addr = alloca i16*, align 4
%pDst.addr = alloca i16*, align 4
%blockSize.addr = alloca i32, align 4
%blkCnt = alloca i32, align 4
store i16* %pSrcA, i16** %pSrcA.addr, align 4
store i16* %pSrcB, i16** %pSrcB.addr, align 4
store i16* %pDst, i16** %pDst.addr, align 4
store i32 %blockSize, i32* %blockSize.addr, align 4
%0 = bitcast i32* %blkCnt to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
%1 = load i32, i32* %blockSize.addr, align 4
store i32 %1, i32* %blkCnt, align 4
br label %while.cond

; Loop header: keep iterating while the remaining count is non-zero
; (unsigned compare against 0).
while.cond: ; preds = %while.body, %entry
%2 = load i32, i32* %blkCnt, align 4
%cmp = icmp ugt i32 %2, 0
br i1 %cmp, label %while.body, label %while.end

; Loop body: one element per iteration.
while.body: ; preds = %while.cond
; Load *pSrcA, post-increment the stored pointer, and widen to i32.
%3 = load i16*, i16** %pSrcA.addr, align 4
%incdec.ptr = getelementptr inbounds i16, i16* %3, i32 1
store i16* %incdec.ptr, i16** %pSrcA.addr, align 4
%4 = load i16, i16* %3, align 2
%conv = sext i16 %4 to i32
; Load *pSrcB, post-increment the stored pointer, and widen to i32.
%5 = load i16*, i16** %pSrcB.addr, align 4
%incdec.ptr1 = getelementptr inbounds i16, i16* %5, i32 1
store i16* %incdec.ptr1, i16** %pSrcB.addr, align 4
%6 = load i16, i16* %5, align 2
%conv2 = sext i16 %6 to i32
; Multiply, shift right by 15, then saturate via @__SSAT with a width of 16
; before narrowing back to i16.
%mul = mul nsw i32 %conv, %conv2
%shr = ashr i32 %mul, 15
%call = call i32 @__SSAT(i32 %shr, i32 16)
%conv3 = trunc i32 %call to i16
; Store the result through *pDst and post-increment the stored pointer.
%7 = load i16*, i16** %pDst.addr, align 4
%incdec.ptr4 = getelementptr inbounds i16, i16* %7, i32 1
store i16* %incdec.ptr4, i16** %pDst.addr, align 4
store i16 %conv3, i16* %7, align 2
; Decrement the remaining-element counter and go back to the header.
%8 = load i32, i32* %blkCnt, align 4
%dec = add i32 %8, -1
store i32 %dec, i32* %blkCnt, align 4
br label %while.cond

; Loop exit: end the lifetime of the counter slot and return.
while.end: ; preds = %while.cond
%9 = bitcast i32* %blkCnt to i8*
call void @llvm.lifetime.end.p0i8(i64 4, i8* %9) #3
ret void
}

declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1

; Signed-saturation helper in unoptimized (alloca-based) form; attribute
; group #2 marks it alwaysinline.
; When 1 <= %sat <= 32 (unsigned compares) it computes
;   max = (1 << (%sat - 1)) - 1   and   min = -1 - max,
; and returns max if %val > max, min if %val < min, and %val otherwise
; (signed compares). When %sat is outside that range, %val is returned
; unchanged.
define internal i32 @__SSAT(i32 %val, i32 %sat) #2 {
; Spill the arguments and test the lower bound of the range check: %sat >= 1.
entry:
%retval = alloca i32, align 4
%val.addr = alloca i32, align 4
%sat.addr = alloca i32, align 4
%max = alloca i32, align 4
%min = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
store i32 %val, i32* %val.addr, align 4
store i32 %sat, i32* %sat.addr, align 4
%0 = load i32, i32* %sat.addr, align 4
%cmp = icmp uge i32 %0, 1
br i1 %cmp, label %land.lhs.true, label %if.end10

; Upper bound of the range check: %sat <= 32.
land.lhs.true: ; preds = %entry
%1 = load i32, i32* %sat.addr, align 4
%cmp1 = icmp ule i32 %1, 32
br i1 %cmp1, label %if.then, label %if.end10

; %sat is in range: compute the max and min bounds, then compare %val
; against max.
if.then: ; preds = %land.lhs.true
%2 = bitcast i32* %max to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3
%3 = load i32, i32* %sat.addr, align 4
%sub = sub i32 %3, 1
%shl = shl i32 1, %sub
%sub2 = sub i32 %shl, 1
store i32 %sub2, i32* %max, align 4
%4 = bitcast i32* %min to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3
%5 = load i32, i32* %max, align 4
%sub3 = sub nsw i32 -1, %5
store i32 %sub3, i32* %min, align 4
%6 = load i32, i32* %val.addr, align 4
%7 = load i32, i32* %max, align 4
%cmp4 = icmp sgt i32 %6, %7
br i1 %cmp4, label %if.then5, label %if.else

; %val > max: the result is max. Cleanup destination 1 means "return".
if.then5: ; preds = %if.then
%8 = load i32, i32* %max, align 4
store i32 %8, i32* %retval, align 4
store i32 1, i32* %cleanup.dest.slot, align 4
br label %cleanup

; Otherwise compare %val against min.
if.else: ; preds = %if.then
%9 = load i32, i32* %val.addr, align 4
%10 = load i32, i32* %min, align 4
%cmp6 = icmp slt i32 %9, %10
br i1 %cmp6, label %if.then7, label %if.end

; %val < min: the result is min. Cleanup destination 1 means "return".
if.then7: ; preds = %if.else
%11 = load i32, i32* %min, align 4
store i32 %11, i32* %retval, align 4
store i32 1, i32* %cleanup.dest.slot, align 4
br label %cleanup

if.end: ; preds = %if.else
br label %if.end8

; %val is within [min, max]: cleanup destination 0 continues to the
; fall-through path that returns %val.
if.end8: ; preds = %if.end
store i32 0, i32* %cleanup.dest.slot, align 4
br label %cleanup

; Shared cleanup: end the lifetimes of min and max, then dispatch on the
; recorded cleanup destination.
cleanup: ; preds = %if.end8, %if.then7, %if.then5
%12 = bitcast i32* %min to i8*
call void @llvm.lifetime.end.p0i8(i64 4, i8* %12) #3
%13 = bitcast i32* %max to i8*
call void @llvm.lifetime.end.p0i8(i64 4, i8* %13) #3
%cleanup.dest = load i32, i32* %cleanup.dest.slot, align 4
switch i32 %cleanup.dest, label %unreachable [
i32 0, label %cleanup.cont
i32 1, label %return
]

cleanup.cont: ; preds = %cleanup
br label %if.end10

; No saturation applied (either %sat was out of range or %val was already
; within bounds): the result is %val itself.
if.end10: ; preds = %cleanup.cont, %land.lhs.true, %entry
%14 = load i32, i32* %val.addr, align 4
store i32 %14, i32* %retval, align 4
br label %return

; Single exit: return whatever was stored in the %retval slot.
return: ; preds = %if.end10, %cleanup
%15 = load i32, i32* %retval, align 4
ret i32 %15

; Default target of the cleanup switch; only destinations 0 and 1 are ever
; stored above.
unreachable: ; preds = %cleanup
unreachable
}

declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1

attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
attributes #1 = { argmemonly nofree nosync nounwind willreturn }
attributes #2 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
attributes #3 = { nounwind }

0 comments on commit efa340f

Please sign in to comment.