diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 1a279b6198182..f02ca1367002a 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1318,6 +1318,11 @@ class LSRUse {
   /// the loop, in which case some special-case heuristics may be used.
   bool AllFixupsOutsideLoop = true;
 
+  /// This records whether all of the fixups using this LSRUse are unconditional
+  /// within the loop, meaning they will be executed in every iteration of the
+  /// loop.
+  bool AllFixupsUnconditional = true;
+
   /// RigidFormula is set to true to guarantee that this use will be associated
   /// with a single formula--the one that initially matched. Some SCEV
   /// expressions cannot be expanded. This allows LSR to consider the registers
@@ -1421,16 +1426,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
     if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
         TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
       const SCEV *Start;
-      const SCEVConstant *Step;
-      if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
+      const APInt *Step;
+      if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
         // If the step size matches the base offset, we could use pre-indexed
         // addressing.
-        if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
-             Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
-            ((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
-             SE->isLoopInvariant(Start, L)))
+        bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
+                           F.BaseOffset.isFixed() &&
+                           *Step == F.BaseOffset.getFixedValue();
+        bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
+                            !isa<SCEVConstant>(Start) &&
+                            SE->isLoopInvariant(Start, L);
+        // We can only pre or post index when the load/store is unconditional.
+        if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
           LoopCost = 0;
+      }
     }
+
     // If the loop counts down to zero and we'll be using a hardware loop then
     // the addrec will be combined into the hardware loop instruction.
     if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
@@ -1783,6 +1794,9 @@ void LSRUse::print(raw_ostream &OS) const {
   if (AllFixupsOutsideLoop)
     OS << ", all-fixups-outside-loop";
 
+  if (AllFixupsUnconditional)
+    OS << ", all-fixups-unconditional";
+
   if (WidestFixupType)
     OS << ", widest fixup type: " << *WidestFixupType;
 }
@@ -2213,6 +2227,7 @@ class LSRInstance {
   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   void CountRegisters(const Formula &F, size_t LUIdx);
   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+  bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
 
   void CollectLoopInvariantFixupsAndFormulae();
 
@@ -3607,6 +3622,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
     LF.PostIncLoops = TmpPostIncLoops;
     LF.Offset = Offset;
     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+    LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
 
     // Create SCEV as Formula for calculating baseline cost
     if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
@@ -3680,6 +3696,14 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
   return true;
 }
 
+/// Test whether this fixup will be executed each time the corresponding IV
+/// increment instruction is executed.
+bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const { + // If the fixup block dominates the IV increment block then there is no path + // through the loop to the increment that doesn't pass through the fixup. + return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent()); +} + /// Check for other uses of loop-invariant values which we're tracking. These /// other uses will pin these values in registers, making them less profitable /// for elimination. @@ -3803,6 +3827,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LF.OperandValToReplace = U; LF.Offset = Offset; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF); if (!LU.WidestFixupType || SE.getTypeSizeInBits(LU.WidestFixupType) < SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) @@ -4940,6 +4965,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional; // Transfer the fixups of LU to LUThatHas. for (LSRFixup &Fixup : LU.Fixups) { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll index 9c36bae6fac13..ec257bcf123f3 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -6,77 +6,81 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: ldr.w r12, [r0] ; CHECK-NEXT: subs.w r9, r1, #1 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: and r8, r9, #3 +; CHECK-NEXT: and r6, r9, #3 ; CHECK-NEXT: subs r7, r1, #2 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB0_4 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: b .LBB0_6 +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: cbnz r6, .LBB0_7 +; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .LBB0_4: @ %while.body.preheader.new ; CHECK-NEXT: bic r7, r9, #3 -; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: movs r7, #4 ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r10, [r0, #16]! -; CHECK-NEXT: sub.w r9, r9, #4 -; CHECK-NEXT: ldrd r5, r4, [r0, #-12] -; CHECK-NEXT: ldr r11, [r0, #-4] +; CHECK-NEXT: ldr r11, [r0, #16]! 
+; CHECK-NEXT: ldrd r5, r7, [r0, #-12] +; CHECK-NEXT: ldr r4, [r0, #-4] ; CHECK-NEXT: cmp r12, r5 -; CHECK-NEXT: it gt -; CHECK-NEXT: subgt r6, r7, #3 ; CHECK-NEXT: csel r5, r5, r12, gt -; CHECK-NEXT: cmp r5, r4 +; CHECK-NEXT: csinc r6, r10, r8, le +; CHECK-NEXT: cmp r5, r7 ; CHECK-NEXT: it gt -; CHECK-NEXT: subgt r6, r7, #2 -; CHECK-NEXT: csel r5, r4, r5, gt -; CHECK-NEXT: cmp r5, r11 +; CHECK-NEXT: addgt.w r6, r8, #2 +; CHECK-NEXT: csel r7, r7, r5, gt +; CHECK-NEXT: cmp r7, r4 ; CHECK-NEXT: it gt -; CHECK-NEXT: subgt r6, r7, #1 -; CHECK-NEXT: csel r5, r11, r5, gt -; CHECK-NEXT: cmp r5, r10 -; CHECK-NEXT: csel r6, r7, r6, gt -; CHECK-NEXT: add.w r7, r7, #4 -; CHECK-NEXT: csel r12, r10, r5, gt +; CHECK-NEXT: addgt.w r6, r8, #3 +; CHECK-NEXT: csel r7, r4, r7, gt +; CHECK-NEXT: add.w r8, r8, #4 +; CHECK-NEXT: cmp r7, r11 +; CHECK-NEXT: csel r10, r8, r6, gt +; CHECK-NEXT: csel r12, r11, r7, gt ; CHECK-NEXT: le lr, .LBB0_5 -; CHECK-NEXT: .LBB0_6: @ %while.end.loopexit.unr-lcssa -; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: beq .LBB0_10 -; CHECK-NEXT: @ %bb.7: @ %while.body.epil +; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit +; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload +; CHECK-NEXT: sub.w r9, r9, r8 +; CHECK-NEXT: cbz r6, .LBB0_10 +; CHECK-NEXT: .LBB0_7: @ %while.body.epil ; CHECK-NEXT: ldr r7, [r0, #4] ; CHECK-NEXT: sub.w r1, r1, r9 ; CHECK-NEXT: cmp r12, r7 -; CHECK-NEXT: csel r6, r1, r6, gt +; CHECK-NEXT: csel r10, r1, r10, gt ; CHECK-NEXT: csel r12, r7, r12, gt -; CHECK-NEXT: cmp.w r8, #1 +; CHECK-NEXT: cmp r6, #1 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.8: @ %while.body.epil.1 ; CHECK-NEXT: ldr r7, [r0, #8] ; CHECK-NEXT: cmp r12, r7 -; CHECK-NEXT: csinc r6, r6, r1, le +; CHECK-NEXT: csinc r10, r10, r1, le ; CHECK-NEXT: csel r12, r7, r12, gt -; CHECK-NEXT: cmp.w r8, #2 +; CHECK-NEXT: cmp r6, #2 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.9: @ %while.body.epil.2 ; CHECK-NEXT: ldr r0, [r0, #12] ; CHECK-NEXT: cmp r12, r0 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt r6, r1, #2 +; CHECK-NEXT: addgt.w r10, r1, #2 ; CHECK-NEXT: csel r12, r0, r12, gt ; CHECK-NEXT: .LBB0_10: @ %while.end ; CHECK-NEXT: str.w r12, [r2] -; CHECK-NEXT: str r6, [r3] +; CHECK-NEXT: str.w r10, [r3] +; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %0 = load i32, ptr %pSrc, align 4 diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll index db30fd23b0c9d..1944a9c800355 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll @@ -119,8 +119,6 @@ for.end: ; We can't use postindex addressing on the conditional load of qval and can't ; convert the loop condition to a compare with zero, so we should instead use ; offset addressing. -; FIXME: Currently we don't notice the load of qval is conditional, and attempt -; postindex addressing anyway. 
define i32 @conditional_load(ptr %p, ptr %q, ptr %n) { ; CHECK-LABEL: define i32 @conditional_load( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) { @@ -128,7 +126,6 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) { ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ] -; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ] ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4 @@ -136,6 +133,8 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) { ; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]] ; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IDX]], 2 +; CHECK-NEXT: [[LSR_IV:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP0]] ; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RET]], [[QVAL]] ; CHECK-NEXT: br label %[[FOR_INC]] @@ -143,7 +142,6 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) { ; CHECK-NEXT: [[RET_NEXT]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[RET]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1 ; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8 -; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]] ; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]] ; CHECK: [[EXIT]]: @@ -176,3 +174,141 @@ for.inc: exit: ret i32 %ret.next } + +; We can use postindex addressing for both loads here, even though the second +; may not be executed on every loop iteration. 
+define i32 @early_exit_load(ptr %p, ptr %q, ptr %n) { +; CHECK-LABEL: define i32 @early_exit_load( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ] +; CHECK-NEXT: [[RET_PHI:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[PVAL]], 0 +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 +; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_INC]], label %[[EXIT:.*]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[QVAL]], [[RET_PHI]] +; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1 +; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]] +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_PHI]], %[[FOR_BODY]] ], [ [[ADD]], %[[FOR_INC]] ] +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + br label %for.body + +for.body: + %ret.phi = phi i32 [ %add, %for.inc ], [ 0, %entry ] + %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ] + %paddr = getelementptr inbounds nuw i32, ptr %p, i64 %idx + %pval = load i32, ptr %paddr, align 4 + %cmp1 = icmp eq i32 %pval, 0 + br i1 %cmp1, label %for.inc, label %exit + +for.inc: + %qaddr = getelementptr inbounds nuw i32, ptr %q, i64 %idx + %qval = load i32, ptr %qaddr, align 4 + %add = add nsw i32 %qval, %ret.phi + %idx.next = add nuw nsw i64 %idx, 1 + %nval = load volatile i64, ptr %n, align 8 + %cmp2 = icmp slt i64 %idx.next, %nval + br i1 %cmp2, label %for.body, label %exit + +exit: + %ret = phi i32 [ %ret.phi, %for.body ], [ %add, %for.inc ] + ret i32 %ret +} + +; The control-flow before and after the load of qval shouldn't prevent postindex +; addressing from happening. +; FIXME: We choose postindex addressing, but the scevgep is placed in for.inc so +; during codegen we will fail to actually generate a postindex load. 
+define void @middle_block_load(ptr %p, ptr %q, i64 %n) { +; CHECK-LABEL: define void @middle_block_load( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_INC]] ], [ [[N]], %[[ENTRY]] ] +; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV2]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[PVAL]], 0 +; CHECK-NEXT: [[SCEVGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 4 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN1:.*]], label %[[IF_ELSE1:.*]] +; CHECK: [[IF_THEN1]]: +; CHECK-NEXT: tail call void @otherfn1() +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_ELSE1]]: +; CHECK-NEXT: tail call void @otherfn2() +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[QVAL]], 0 +; CHECK-NEXT: br i1 [[CMP2]], label %[[IF_THEN2:.*]], label %[[IF_ELSE2:.*]] +; CHECK: [[IF_THEN2]]: +; CHECK-NEXT: tail call void @otherfn1() +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[IF_ELSE2]]: +; CHECK-NEXT: tail call void @otherfn2() +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[CMP3]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ] + %paddr = getelementptr inbounds nuw i32, ptr %p, i64 %idx + %pval = load i32, ptr %paddr, align 4 + %cmp1 = icmp sgt i32 %pval, 0 + br i1 %cmp1, label %if.then1, label %if.else1 + +if.then1: + tail call void @otherfn1() + br label %if.end + +if.else1: + tail call void @otherfn2() + br label %if.end + +if.end: + %qaddr = getelementptr inbounds nuw i32, ptr %q, i64 %idx + %qval = load i32, ptr %qaddr, align 4 + %cmp2 = icmp sgt i32 %qval, 0 + br i1 %cmp2, label %if.then2, label %if.else2 + +if.then2: + tail call void @otherfn1() + br label %for.inc + +if.else2: + tail call void @otherfn2() + br label %for.inc + +for.inc: + %idx.next = add nuw nsw i64 %idx, 1 + %cmp3 = icmp eq i64 %idx.next, %n + br i1 %cmp3, label %exit, label %for.body + +exit: + ret void +} + +declare dso_local void @otherfn1() +declare dso_local void @otherfn2()
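Note, not part of the patch: the new IsFixupExecutedEachIncrement helper reduces to a single
dominance query between the block containing the fixup's user instruction and the block where
the IV increment will be inserted. The standalone C++ sketch below rebuilds the CFG shape of
the @conditional_load test with the LLVM C++ API and prints the two queries that separate the
conditional qval load from the unconditional pval load. The file name, function name and block
layout are invented for the illustration, and the real pass queries IVIncInsertPos rather than
assuming the latch block, so treat this as a sketch of the idea rather than the pass itself.

// dominance_sketch.cpp (hypothetical): shows the dominance query behind
// IsFixupExecutedEachIncrement on a conditional_load-shaped CFG.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("dominance_sketch", Ctx);
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, Function::ExternalLinkage, "conditional_load_shape", M);

  // Same block structure as @conditional_load: for.body either falls through to
  // for.inc directly or goes via if.then, so if.then is only conditionally reached.
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  BasicBlock *Body = BasicBlock::Create(Ctx, "for.body", F);
  BasicBlock *Then = BasicBlock::Create(Ctx, "if.then", F);
  BasicBlock *Inc = BasicBlock::Create(Ctx, "for.inc", F);
  BasicBlock *Exit = BasicBlock::Create(Ctx, "exit", F);

  IRBuilder<> B(Entry);
  B.CreateBr(Body);
  B.SetInsertPoint(Body);
  Value *Cond = B.getTrue(); // stand-in for the real loop-varying conditions
  B.CreateCondBr(Cond, Inc, Then);
  B.SetInsertPoint(Then);
  B.CreateBr(Inc);
  B.SetInsertPoint(Inc);
  B.CreateCondBr(Cond, Body, Exit);
  B.SetInsertPoint(Exit);
  B.CreateRetVoid();

  DominatorTree DT(*F);
  // The IV increment of this loop would be expanded in for.inc, so a fixup is
  // unconditional exactly when its parent block dominates for.inc.
  outs() << "if.then dominates for.inc:  "
         << (DT.dominates(Then, Inc) ? "yes" : "no") << "\n"; // no  -> conditional
  outs() << "for.body dominates for.inc: "
         << (DT.dominates(Body, Inc) ? "yes" : "no") << "\n"; // yes -> unconditional
  return 0;
}

It should build with something like
clang++ -std=c++17 dominance_sketch.cpp $(llvm-config --cxxflags --ldflags --libs core support)
and print "no" for if.then and "yes" for for.body, matching the conditional_load and
early_exit_load expectations above.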