diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 197aae6e03cb1..4ae143147e421 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -77,6 +77,10 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
     "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
     cl::desc("The number of instructions to search for a redundant dmb"));
 
+static cl::opt<unsigned> Aarch64ForceUnrollThreshold(
+    "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
+    cl::desc("Threshold for forced unrolling of small loops in AArch64"));
+
 namespace {
 class TailFoldingOption {
   // These bitfields will only ever be set to something non-zero in operator=,
@@ -5250,6 +5254,7 @@ void AArch64TTIImpl::getUnrollingPreferences(
   // inlining. Don't unroll auto-vectorized loops either, though do allow
   // unrolling of the scalar remainder.
   bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
+  InstructionCost Cost = 0;
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
       // Both auto-vectorized loops and the scalar remainder have the
@@ -5264,6 +5269,10 @@ void AArch64TTIImpl::getUnrollingPreferences(
               continue;
         return;
       }
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      Cost += getInstructionCost(&I, Operands,
+                                 TargetTransformInfo::TCK_SizeAndLatency);
     }
   }
 
@@ -5310,6 +5319,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
     UP.UnrollAndJam = true;
     UP.UnrollAndJamInnerLoopThreshold = 60;
   }
+
+  // Forcing unrolling of small loops can be very useful because of the
+  // branch-taken cost of the backedge.
+  if (Cost < Aarch64ForceUnrollThreshold)
+    UP.Force = true;
 }
 
 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll b/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll
new file mode 100644
index 0000000000000..986df8bed8462
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll
@@ -0,0 +1,90 @@
+; RUN: opt -passes=loop-unroll -S -unroll-runtime %s | FileCheck %s --check-prefix=NOFORCE
+; RUN: opt -passes=loop-unroll -S -unroll-runtime -aarch64-force-unroll-threshold=500 %s | FileCheck %s --check-prefix=FORCE
+
+; The loop has a small runtime upper bound (at most four iterations) but a
+; relatively expensive body. With runtime unrolling enabled, the cost model
+; still leaves the loop rolled. Raising the AArch64 force threshold overrides
+; that decision and unrolls.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @force_small_loop(ptr nocapture %a, ptr nocapture %b, i32 %n) {
+entry:
+  br label %loop
+
+; NOFORCE-LABEL: @force_small_loop(
+; NOFORCE: loop:
+; NOFORCE: br i1 %cond, label %body, label %exit
+; NOFORCE: body:
+; NOFORCE: store i32 %mix15, ptr %ptrb, align 4
+; NOFORCE: latch:
+; NOFORCE: br i1 %cmp2, label %loop, label %exit
+; NOFORCE: ret void
+; NOFORCE-NOT: loop.1:
+;
+; FORCE-LABEL: @force_small_loop(
+; FORCE: loop:
+; FORCE: br i1 %cond, label %body, label %exit
+; FORCE: loop.1:
+; FORCE: br i1 true, label %body.1, label %exit
+; FORCE: body.1:
+; FORCE: store i32 %mix15.1, ptr %ptrb.1, align 4
+; FORCE: latch.1:
+; FORCE: br i1 %cmp2.1, label %loop, label %exit
+; FORCE: ret void
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %inc, %latch ]
+  %ptra = getelementptr inbounds i32, ptr %a, i32 %i
+  %pa = load i32, ptr %ptra, align 4
+  %tmp0 = mul nsw i32 %pa, %pa
+  %tmp1 = add nsw i32 %tmp0, %pa
+  %tmp2 = shl i32 %tmp1, 1
+  %tmp3 = ashr i32 %tmp2, 1
+  %tmp4 = xor i32 %tmp3, %pa
+  %tmp5 = add nsw i32 %tmp4, 7
+  %tmp6 = mul nsw i32 %tmp5, 5
+  %tmp7 = add nsw i32 %tmp6, %tmp4
+  %tmp8 = mul nsw i32 %tmp7, %tmp3
+  %tmp9 = add nsw i32 %tmp8, %tmp7
+  %tmp10 = xor i32 %tmp9, %tmp6
+  %tmp11 = add nsw i32 %tmp10, %tmp8
+  %tmp12 = mul nsw i32 %tmp11, 9
+  %tmp13 = add nsw i32 %tmp12, %tmp10
+  %tmp14 = xor i32 %tmp13, %tmp11
+  %cond = icmp ult i32 %i, %n
+  br i1 %cond, label %body, label %exit
+
+body:
+  %ptrb = getelementptr inbounds i32, ptr %b, i32 %i
+  %pb = load i32, ptr %ptrb, align 4
+  %sum = add nsw i32 %pb, %tmp14
+  %diff = sub nsw i32 %sum, %pa
+  %mix1 = mul nsw i32 %diff, 3
+  %mix2 = add nsw i32 %mix1, %tmp3
+  %mix3 = xor i32 %mix2, %diff
+  %mix4 = add nsw i32 %mix3, %tmp0
+  %mix5 = mul nsw i32 %mix4, 11
+  %mix6 = add nsw i32 %mix5, %mix2
+  %mix7 = xor i32 %mix6, %mix5
+  %mix8 = add nsw i32 %mix7, %mix3
+  %mix9 = mul nsw i32 %mix8, 13
+  %mix10 = add nsw i32 %mix9, %mix8
+  %mix11 = xor i32 %mix10, %mix7
+  %mix12 = add nsw i32 %mix11, %mix6
+  %mix13 = mul nsw i32 %mix12, 17
+  %mix14 = add nsw i32 %mix13, %mix9
+  %mix15 = xor i32 %mix14, %mix10
+  store i32 %mix15, ptr %ptrb, align 4
+  br label %latch
+
+latch:
+  %inc = add nuw nsw i32 %i, 1
+  %cmp.limit = icmp ult i32 %n, 4
+  %upper = select i1 %cmp.limit, i32 %n, i32 4
+  %cmp2 = icmp ult i32 %inc, %upper
+  br i1 %cmp2, label %loop, label %exit
+
+exit:
+  ret void
+}