From bf11f2977ab3c311ad610533ffff7eb004eaaee1 Mon Sep 17 00:00:00 2001 From: nasmnc01 Date: Sun, 2 Nov 2025 17:09:59 +0000 Subject: [PATCH 1/5] [LoopFlatten] Add option to version loops instead of widening IVs LoopFlatten can sometimes generate loops like the following: ``` vector.body: %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %and = and i64 %index, 4294967295 %index.next = add i64 %and, 1 %exit.cond = icmp ugt i64 %index.next, %N br i1 %exit.cond, label %end, label %vector.body ``` The AND mask instruction is introduced by LoopFlatten. To enable flattening a loop, this pass attempts to widen the induction variables to compute the new trip count. If widening is successful, it introduces the AND mask instruction to check that the new widened IV doesn't overflow the original type width. This behaviour avoids runtime checks on the IV, but can slow down loop code considerably in some cases by reducing the effectiveness of auto-vectorization. This patch introduces the -loop-flatten-version-over-widen flag to the LoopFlatten pass. When enabled, this optional flag attempts to version the original loop, introducing a runtime check on whether the IV overflows, instead of widening. We find that this flag, when enabled together with other loop-nest-optimization and loop-vectorization flags, can improve performance on internal auto-vectorization workloads by up to 23% on AArch64.
Change-Id: I94572e65411cfeca3f617c60148f1c02500ab056 --- llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 72 +++-- .../LoopFlatten/loop-flatten-version.ll | 259 ++++++++++++++++++ 2 files changed, 305 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index 04039b885f3c5..05d414811eabe 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -102,6 +102,10 @@ static cl::opt VersionLoops("loop-flatten-version-loops", cl::Hidden, cl::init(true), cl::desc("Version loops if flattened loop could overflow")); +static cl::opt VersionLoopsOverWiden( + "loop-flatten-version-over-widen", cl::Hidden, cl::init(false), + cl::desc("Version loops and generate runtime checks over widening the IV")); + namespace { // We require all uses of both induction variables to match this pattern: // @@ -835,14 +839,52 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, return true; } +static bool VersionLoop(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, const LoopAccessInfo &LAI) { + + // Version the loop. The overflow check isn't a runtime pointer check, so we + // pass an empty list of runtime pointer checks, causing LoopVersioning to + // emit 'false' as the branch condition, and add our own check afterwards. + BasicBlock *CheckBlock = FI.OuterLoop->getLoopPreheader(); + ArrayRef Checks(nullptr, nullptr); + LoopVersioning LVer(LAI, Checks, FI.OuterLoop, LI, DT, SE); + LVer.versionLoop(); + + // Check for overflow by calculating the new tripcount using + // umul_with_overflow and then checking if it overflowed. 
+ BranchInst *Br = cast(CheckBlock->getTerminator()); + if (!Br->isConditional()) + return false; + if (!match(Br->getCondition(), m_Zero())) + return false; + IRBuilder<> Builder(Br); + Value *Call = Builder.CreateIntrinsic(Intrinsic::umul_with_overflow, + FI.OuterTripCount->getType(), + {FI.OuterTripCount, FI.InnerTripCount}, + /*FMFSource=*/nullptr, "flatten.mul"); + FI.NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount"); + Value *Overflow = Builder.CreateExtractValue(Call, 1, "flatten.overflow"); + Br->setCondition(Overflow); + return true; +} + static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, - const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI, + const LoopAccessInfo &LAI) { if (!WidenIV) { LLVM_DEBUG(dbgs() << "Widening the IVs is disabled\n"); return false; } + // TODO: don't bother widening IV's if know that they + // can't overflow. If they can overflow opt for versioning + // the loop and remove requirement to truncate when using + // IV in the loop + if (VersionLoopsOverWiden) + if (VersionLoop(FI, DT, LI, SE, LAI)) + return true; + LLVM_DEBUG(dbgs() << "Try widening the IVs\n"); Module *M = FI.InnerLoop->getHeader()->getParent()->getParent(); auto &DL = M->getDataLayout(); @@ -916,7 +958,8 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, return false; // Check if we can widen the induction variables to avoid overflow checks. - bool CanFlatten = CanWidenIV(FI, DT, LI, SE, AC, TTI); + // TODO: widening doesn't remove overflow checks in practice + bool CanFlatten = CanWidenIV(FI, DT, LI, SE, AC, TTI, LAI); // It can happen that after widening of the IV, flattening may not be // possible/happening, e.g. when it is deemed unprofitable. 
So bail here if @@ -961,30 +1004,7 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, return false; } LLVM_DEBUG(dbgs() << "Multiply might overflow, versioning loop\n"); - - // Version the loop. The overflow check isn't a runtime pointer check, so we - // pass an empty list of runtime pointer checks, causing LoopVersioning to - // emit 'false' as the branch condition, and add our own check afterwards. - BasicBlock *CheckBlock = FI.OuterLoop->getLoopPreheader(); - ArrayRef Checks(nullptr, nullptr); - LoopVersioning LVer(LAI, Checks, FI.OuterLoop, LI, DT, SE); - LVer.versionLoop(); - - // Check for overflow by calculating the new tripcount using - // umul_with_overflow and then checking if it overflowed. - BranchInst *Br = cast(CheckBlock->getTerminator()); - assert(Br->isConditional() && - "Expected LoopVersioning to generate a conditional branch"); - assert(match(Br->getCondition(), m_Zero()) && - "Expected branch condition to be false"); - IRBuilder<> Builder(Br); - Value *Call = Builder.CreateIntrinsic( - Intrinsic::umul_with_overflow, FI.OuterTripCount->getType(), - {FI.OuterTripCount, FI.InnerTripCount}, - /*FMFSource=*/nullptr, "flatten.mul"); - FI.NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount"); - Value *Overflow = Builder.CreateExtractValue(Call, 1, "flatten.overflow"); - Br->setCondition(Overflow); + assert(VersionLoop(FI, DT, LI, SE, LAI) && "Failed to version loop"); } else { LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n"); } diff --git a/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll index 85072bf3a43f4..1de31d2c7c70d 100644 --- a/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll +++ b/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt %s -S 
-passes='loop(loop-flatten),verify' -verify-loop-info -verify-dom-info -verify-scev -o - | FileCheck %s +; RUN: opt %s -S -passes='loop(loop-flatten),verify' -loop-flatten-version-over-widen -verify-loop-info -verify-dom-info -verify-scev -o - | FileCheck %s --check-prefix=CHECK-VERSION-OVER-WIDEN target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" @@ -61,6 +62,62 @@ define void @noinbounds_gep(i32 %N, ptr %A) { ; CHECK: for.end: ; CHECK-NEXT: ret void ; +; CHECK-VERSION-OVER-WIDEN-LABEL: define void @noinbounds_gep( +; CHECK-VERSION-OVER-WIDEN-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) { +; CHECK-VERSION-OVER-WIDEN-NEXT: entry: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP3:%.*]] = icmp ult i32 0, [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP3]], label [[FOR_INNER_PREHEADER_LVER_CHECK:%.*]], label [[FOR_END:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.lver.check: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[N]], i32 [[N]]) +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_TRIPCOUNT:%.*]] = extractvalue { i32, i1 } [[FLATTEN_MUL]], 0 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[FLATTEN_MUL]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[FLATTEN_OVERFLOW]], label [[FOR_INNER_PREHEADER_PH_LVER_ORIG:%.*]], label [[FOR_INNER_PREHEADER_PH:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.ph.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER_PREHEADER_LVER_ORIG:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[I_LVER_ORIG:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER_PH_LVER_ORIG]] ], [ [[INC2_LVER_ORIG:%.*]], [[FOR_OUTER_LVER_ORIG:%.*]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER_LVER_ORIG:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[J_LVER_ORIG:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER_LVER_ORIG]] ], [ 
[[INC1_LVER_ORIG:%.*]], [[FOR_INNER_LVER_ORIG]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[MUL_LVER_ORIG:%.*]] = mul i32 [[I_LVER_ORIG]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[GEP_LVER_ORIG:%.*]] = getelementptr i32, ptr [[A]], i32 [[MUL_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ARRAYIDX_LVER_ORIG:%.*]] = getelementptr i32, ptr [[GEP_LVER_ORIG]], i32 [[J_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN-NEXT: store i32 0, ptr [[ARRAYIDX_LVER_ORIG]], align 4 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC1_LVER_ORIG]] = add nuw i32 [[J_LVER_ORIG]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP2_LVER_ORIG:%.*]] = icmp ult i32 [[INC1_LVER_ORIG]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP2_LVER_ORIG]], label [[FOR_INNER_LVER_ORIG]], label [[FOR_OUTER_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN: for.outer.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC2_LVER_ORIG]] = add i32 [[I_LVER_ORIG]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP1_LVER_ORIG:%.*]] = icmp ult i32 [[INC2_LVER_ORIG]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP1_LVER_ORIG]], label [[FOR_INNER_PREHEADER_LVER_ORIG]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.ph: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER_PREHEADER:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.preheader: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[I:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER_PH]] ], [ [[INC2:%.*]], [[FOR_OUTER:%.*]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_ARRAYIDX:%.*]] = getelementptr i32, ptr [[A]], i32 [[I]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[MUL:%.*]] = mul i32 [[I]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A]], i32 [[MUL]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[GEP]], i32 [[J]] +; 
CHECK-VERSION-OVER-WIDEN-NEXT: store i32 0, ptr [[FLATTEN_ARRAYIDX]], align 4 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC1:%.*]] = add nuw i32 [[J]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC1]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_OUTER]] +; CHECK-VERSION-OVER-WIDEN: for.outer: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC2]] = add i32 [[I]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP1:%.*]] = icmp ult i32 [[INC2]], [[FLATTEN_TRIPCOUNT]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP1]], label [[FOR_INNER_PREHEADER]], label [[FOR_END_LOOPEXIT_LOOPEXIT1:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.end.loopexit.loopexit: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.end.loopexit.loopexit1: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK-VERSION-OVER-WIDEN: for.end.loopexit: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_END]] +; CHECK-VERSION-OVER-WIDEN: for.end: +; CHECK-VERSION-OVER-WIDEN-NEXT: ret void +; entry: %cmp3 = icmp ult i32 0, %N br i1 %cmp3, label %for.outer.preheader, label %for.end @@ -124,6 +181,62 @@ define void @noinbounds_gep_too_large_mul(i64 %N, ptr %A) { ; CHECK: for.end: ; CHECK-NEXT: ret void ; +; CHECK-VERSION-OVER-WIDEN-LABEL: define void @noinbounds_gep_too_large_mul( +; CHECK-VERSION-OVER-WIDEN-SAME: i64 [[N:%.*]], ptr [[A:%.*]]) { +; CHECK-VERSION-OVER-WIDEN-NEXT: entry: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP3:%.*]] = icmp ult i64 0, [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP3]], label [[FOR_INNER_PREHEADER_LVER_CHECK:%.*]], label [[FOR_END:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.lver.check: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[N]], i64 [[N]]) +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_TRIPCOUNT:%.*]] = extractvalue { i64, i1 } [[FLATTEN_MUL]], 0 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_OVERFLOW:%.*]] = extractvalue { i64, i1 } 
[[FLATTEN_MUL]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[FLATTEN_OVERFLOW]], label [[FOR_INNER_PREHEADER_PH_LVER_ORIG:%.*]], label [[FOR_INNER_PREHEADER_PH:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.ph.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER_PREHEADER_LVER_ORIG:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[I_LVER_ORIG:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER_PH_LVER_ORIG]] ], [ [[INC2_LVER_ORIG:%.*]], [[FOR_OUTER_LVER_ORIG:%.*]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER_LVER_ORIG:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[J_LVER_ORIG:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER_LVER_ORIG]] ], [ [[INC1_LVER_ORIG:%.*]], [[FOR_INNER_LVER_ORIG]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[MUL_LVER_ORIG:%.*]] = mul i64 [[I_LVER_ORIG]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[GEP_LVER_ORIG:%.*]] = getelementptr i32, ptr [[A]], i64 [[MUL_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ARRAYIDX_LVER_ORIG:%.*]] = getelementptr i32, ptr [[GEP_LVER_ORIG]], i64 [[J_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN-NEXT: store i32 0, ptr [[ARRAYIDX_LVER_ORIG]], align 4 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC1_LVER_ORIG]] = add nuw i64 [[J_LVER_ORIG]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP2_LVER_ORIG:%.*]] = icmp ult i64 [[INC1_LVER_ORIG]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP2_LVER_ORIG]], label [[FOR_INNER_LVER_ORIG]], label [[FOR_OUTER_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN: for.outer.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC2_LVER_ORIG]] = add i64 [[I_LVER_ORIG]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP1_LVER_ORIG:%.*]] = icmp ult i64 [[INC2_LVER_ORIG]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP1_LVER_ORIG]], label [[FOR_INNER_PREHEADER_LVER_ORIG]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.ph: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label 
[[FOR_INNER_PREHEADER:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner.preheader: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[I:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER_PH]] ], [ [[INC2:%.*]], [[FOR_OUTER:%.*]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_ARRAYIDX:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.inner: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[J:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[MUL:%.*]] = mul i64 [[I]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A]], i64 [[MUL]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[GEP]], i64 [[J]] +; CHECK-VERSION-OVER-WIDEN-NEXT: store i32 0, ptr [[FLATTEN_ARRAYIDX]], align 4 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC1:%.*]] = add nuw i64 [[J]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INC1]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_OUTER]] +; CHECK-VERSION-OVER-WIDEN: for.outer: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC2]] = add i64 [[I]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP1:%.*]] = icmp ult i64 [[INC2]], [[FLATTEN_TRIPCOUNT]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP1]], label [[FOR_INNER_PREHEADER]], label [[FOR_END_LOOPEXIT_LOOPEXIT1:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.end.loopexit.loopexit: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.end.loopexit.loopexit1: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK-VERSION-OVER-WIDEN: for.end.loopexit: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_END]] +; CHECK-VERSION-OVER-WIDEN: for.end: +; CHECK-VERSION-OVER-WIDEN-NEXT: ret void +; entry: %cmp3 = icmp ult i64 0, %N br i1 %cmp3, label %for.outer.preheader, label %for.end @@ -238,6 +351,79 @@ define void @d3_2(ptr %A, i32 %N, i32 %M) { ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; 
CHECK-VERSION-OVER-WIDEN-LABEL: define void @d3_2( +; CHECK-VERSION-OVER-WIDEN-SAME: ptr [[A:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) { +; CHECK-VERSION-OVER-WIDEN-NEXT: entry: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP30]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.preheader.lr.ph: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP625:%.*]] = icmp sgt i32 [[M]], 0 +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.preheader.us: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[K_031_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INC13_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP625]], label [[FOR_COND5_PREHEADER_US_US_LVER_CHECK:%.*]], label [[FOR_COND5_PREHEADER_US43_PREHEADER:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond5.preheader.us43.preheader: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LOOPEXIT50:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond5.preheader.us.us.lver.check: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[N]], i32 [[M]]) +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_TRIPCOUNT:%.*]] = extractvalue { i32, i1 } [[FLATTEN_MUL]], 0 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[FLATTEN_MUL]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[FLATTEN_OVERFLOW]], label [[FOR_COND5_PREHEADER_US_US_PH_LVER_ORIG:%.*]], label [[FOR_COND5_PREHEADER_US_US_PH:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond5.preheader.us.us.ph.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND5_PREHEADER_US_US_LVER_ORIG:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond5.preheader.us.us.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[I_028_US_US_LVER_ORIG:%.*]] = phi i32 [ 
[[INC10_US_US_LVER_ORIG:%.*]], [[FOR_COND5_FOR_COND_CLEANUP7_CRIT_EDGE_US_US_LVER_ORIG:%.*]] ], [ 0, [[FOR_COND5_PREHEADER_US_US_PH_LVER_ORIG]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[MUL_US_US_LVER_ORIG:%.*]] = mul nsw i32 [[I_028_US_US_LVER_ORIG]], [[M]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_BODY8_US_US_LVER_ORIG:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.body8.us.us.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[J_026_US_US_LVER_ORIG:%.*]] = phi i32 [ 0, [[FOR_COND5_PREHEADER_US_US_LVER_ORIG]] ], [ [[INC_US_US_LVER_ORIG:%.*]], [[FOR_BODY8_US_US_LVER_ORIG]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ADD_US_US_LVER_ORIG:%.*]] = add nsw i32 [[J_026_US_US_LVER_ORIG]], [[MUL_US_US_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[IDXPROM_US_US_LVER_ORIG:%.*]] = sext i32 [[ADD_US_US_LVER_ORIG]] to i64 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ARRAYIDX_US_US_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM_US_US_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN-NEXT: tail call void @f(ptr [[ARRAYIDX_US_US_LVER_ORIG]]) +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC_US_US_LVER_ORIG]] = add nuw nsw i32 [[J_026_US_US_LVER_ORIG]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[EXITCOND_LVER_ORIG:%.*]] = icmp ne i32 [[INC_US_US_LVER_ORIG]], [[M]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[EXITCOND_LVER_ORIG]], label [[FOR_BODY8_US_US_LVER_ORIG]], label [[FOR_COND5_FOR_COND_CLEANUP7_CRIT_EDGE_US_US_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN: for.cond5.for.cond.cleanup7_crit_edge.us.us.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC10_US_US_LVER_ORIG]] = add nuw nsw i32 [[I_028_US_US_LVER_ORIG]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[EXITCOND51_LVER_ORIG:%.*]] = icmp ne i32 [[INC10_US_US_LVER_ORIG]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[EXITCOND51_LVER_ORIG]], label [[FOR_COND5_PREHEADER_US_US_LVER_ORIG]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LOOPEXIT_LOOPEXIT:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond5.preheader.us.us.ph: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label 
[[FOR_COND5_PREHEADER_US_US:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.for.cond.cleanup3_crit_edge.us.loopexit.loopexit: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LOOPEXIT:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.for.cond.cleanup3_crit_edge.us.loopexit.loopexit1: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LOOPEXIT]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.for.cond.cleanup3_crit_edge.us.loopexit: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.for.cond.cleanup3_crit_edge.us.loopexit50: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.for.cond.cleanup3_crit_edge.us: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC13_US]] = add nuw nsw i32 [[K_031_US]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[EXITCOND52:%.*]] = icmp ne i32 [[INC13_US]], [[N]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[EXITCOND52]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond5.preheader.us.us: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[I_028_US_US:%.*]] = phi i32 [ [[INC10_US_US:%.*]], [[FOR_COND5_FOR_COND_CLEANUP7_CRIT_EDGE_US_US:%.*]] ], [ 0, [[FOR_COND5_PREHEADER_US_US_PH]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[MUL_US_US:%.*]] = mul nsw i32 [[I_028_US_US]], [[M]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_BODY8_US_US:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond5.for.cond.cleanup7_crit_edge.us.us: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC10_US_US]] = add nuw nsw i32 [[I_028_US_US]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[EXITCOND51:%.*]] = icmp ne i32 [[INC10_US_US]], [[FLATTEN_TRIPCOUNT]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[EXITCOND51]], label [[FOR_COND5_PREHEADER_US_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LOOPEXIT_LOOPEXIT1:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.body8.us.us: +; 
CHECK-VERSION-OVER-WIDEN-NEXT: [[J_026_US_US:%.*]] = phi i32 [ 0, [[FOR_COND5_PREHEADER_US_US]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ADD_US_US:%.*]] = add nsw i32 [[J_026_US_US]], [[MUL_US_US]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[IDXPROM_US_US:%.*]] = sext i32 [[I_028_US_US]] to i64 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ARRAYIDX_US_US:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM_US_US]] +; CHECK-VERSION-OVER-WIDEN-NEXT: tail call void @f(ptr [[ARRAYIDX_US_US]]) +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC_US_US:%.*]] = add nuw nsw i32 [[J_026_US_US]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC_US_US]], [[M]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND5_FOR_COND_CLEANUP7_CRIT_EDGE_US_US]] +; CHECK-VERSION-OVER-WIDEN: for.cond.cleanup.loopexit: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-VERSION-OVER-WIDEN: for.cond.cleanup: +; CHECK-VERSION-OVER-WIDEN-NEXT: ret void +; entry: %cmp30 = icmp sgt i32 %N, 0 br i1 %cmp30, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup @@ -371,6 +557,79 @@ define void @overflow(i32 %lim, ptr %a) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[J_016]], 99999 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP3]] ; +; CHECK-VERSION-OVER-WIDEN-LABEL: define void @overflow( +; CHECK-VERSION-OVER-WIDEN-SAME: i32 [[LIM:%.*]], ptr [[A:%.*]]) { +; CHECK-VERSION-OVER-WIDEN-NEXT: entry: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP17_NOT:%.*]] = icmp eq i32 [[LIM]], 0 +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP17_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LVER_CHECK:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.preheader.lver.check: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[LIM]], i32 100000) +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_TRIPCOUNT:%.*]] = extractvalue { i32, i1 } [[FLATTEN_MUL]], 0 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_OVERFLOW:%.*]] = extractvalue { i32, i1 } 
[[FLATTEN_MUL]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[FLATTEN_OVERFLOW]], label [[FOR_COND1_PREHEADER_PH_LVER_ORIG:%.*]], label [[FOR_COND1_PREHEADER_PH:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.preheader.ph.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND1_PREHEADER_LVER_ORIG:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.preheader.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[I_018_LVER_ORIG:%.*]] = phi i32 [ [[INC6_LVER_ORIG:%.*]], [[FOR_COND_CLEANUP3_LVER_ORIG:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PH_LVER_ORIG]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[MUL_LVER_ORIG:%.*]] = mul i32 [[I_018_LVER_ORIG]], 100000 +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_BODY4_LVER_ORIG:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.body4.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[J_016_LVER_ORIG:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LVER_ORIG]] ], [ [[INC_LVER_ORIG:%.*]], [[IF_END_LVER_ORIG:%.*]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ADD_LVER_ORIG:%.*]] = add i32 [[J_016_LVER_ORIG]], [[MUL_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[TMP0:%.*]] = load i32, ptr @first, align 4 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[TOBOOL_NOT_LVER_ORIG:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[TOBOOL_NOT_LVER_ORIG]], label [[IF_END_LVER_ORIG]], label [[IF_THEN_LVER_ORIG:%.*]] +; CHECK-VERSION-OVER-WIDEN: if.then.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ARRAYIDX_LVER_ORIG:%.*]] = getelementptr inbounds [0 x i8], ptr @a, i32 0, i32 [[ADD_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX_LVER_ORIG]], align 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: tail call void asm sideeffect "", "r"(i8 [[TMP1]]) +; CHECK-VERSION-OVER-WIDEN-NEXT: store i32 0, ptr @first, align 4 +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[IF_END_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN: if.end.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: tail call void asm sideeffect "", "r"(i32 [[ADD_LVER_ORIG]]) +; CHECK-VERSION-OVER-WIDEN-NEXT: 
[[INC_LVER_ORIG]] = add nuw nsw i32 [[J_016_LVER_ORIG]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP2_LVER_ORIG:%.*]] = icmp ult i32 [[J_016_LVER_ORIG]], 99999 +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP2_LVER_ORIG]], label [[FOR_BODY4_LVER_ORIG]], label [[FOR_COND_CLEANUP3_LVER_ORIG]] +; CHECK-VERSION-OVER-WIDEN: for.cond.cleanup3.lver.orig: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC6_LVER_ORIG]] = add i32 [[I_018_LVER_ORIG]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP_LVER_ORIG:%.*]] = icmp ult i32 [[INC6_LVER_ORIG]], [[LIM]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP_LVER_ORIG]], label [[FOR_COND1_PREHEADER_LVER_ORIG]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.preheader.ph: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond1.preheader: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[I_018:%.*]] = phi i32 [ [[INC6:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PH]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[MUL:%.*]] = mul i32 [[I_018]], 100000 +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_BODY4:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond.cleanup.loopexit.loopexit: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.cond.cleanup.loopexit.loopexit1: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]] +; CHECK-VERSION-OVER-WIDEN: for.cond.cleanup.loopexit: +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-VERSION-OVER-WIDEN: for.cond.cleanup: +; CHECK-VERSION-OVER-WIDEN-NEXT: ret void +; CHECK-VERSION-OVER-WIDEN: for.cond.cleanup3: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC6]] = add i32 [[I_018]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC6]], [[FLATTEN_TRIPCOUNT]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP]], label [[FOR_COND1_PREHEADER]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT1:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.body4: +; 
CHECK-VERSION-OVER-WIDEN-NEXT: [[J_016:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ADD:%.*]] = add i32 [[J_016]], [[MUL]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[TMP2:%.*]] = load i32, ptr @first, align 4 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP2]], 0 +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK-VERSION-OVER-WIDEN: if.then: +; CHECK-VERSION-OVER-WIDEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr @a, i32 0, i32 [[I_018]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: tail call void asm sideeffect "", "r"(i8 [[TMP3]]) +; CHECK-VERSION-OVER-WIDEN-NEXT: store i32 0, ptr @first, align 4 +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[IF_END]] +; CHECK-VERSION-OVER-WIDEN: if.end: +; CHECK-VERSION-OVER-WIDEN-NEXT: tail call void asm sideeffect "", "r"(i32 [[I_018]]) +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC:%.*]] = add nuw nsw i32 [[J_016]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP2:%.*]] = icmp ult i32 [[J_016]], 99999 +; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_COND_CLEANUP3]] +; entry: %cmp17.not = icmp eq i32 %lim, 0 br i1 %cmp17.not, label %for.cond.cleanup, label %for.cond1.preheader.preheader From 0113265b70360d73d2c3518eaab6fe56048ea535 Mon Sep 17 00:00:00 2001 From: nasmnc01 Date: Mon, 3 Nov 2025 11:57:42 +0000 Subject: [PATCH 2/5] Review comments from copilot Change-Id: I7976117394354ec20fbd4398a245c06f73d64a41 --- llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index 05d414811eabe..b0e044a61739b 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -852,8 +852,8 @@ static bool VersionLoop(FlattenInfo &FI, DominatorTree *DT, 
LoopInfo *LI, // Check for overflow by calculating the new tripcount using // umul_with_overflow and then checking if it overflowed. - BranchInst *Br = cast(CheckBlock->getTerminator()); - if (!Br->isConditional()) + BranchInst *Br = dyn_cast(CheckBlock->getTerminator()); + if (!Br || !Br->isConditional()) return false; if (!match(Br->getCondition(), m_Zero())) return false; @@ -877,13 +877,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, return false; } - // TODO: don't bother widening IV's if know that they - // can't overflow. If they can overflow opt for versioning - // the loop and remove requirement to truncate when using - // IV in the loop - if (VersionLoopsOverWiden) - if (VersionLoop(FI, DT, LI, SE, LAI)) - return true; + if (VersionLoopsOverWiden && VersionLoop(FI, DT, LI, SE, LAI)) + return true; LLVM_DEBUG(dbgs() << "Try widening the IVs\n"); Module *M = FI.InnerLoop->getHeader()->getParent()->getParent(); @@ -958,7 +953,6 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, return false; // Check if we can widen the induction variables to avoid overflow checks. 
- // TODO: widening doesn't remove overflow checks in practice bool CanFlatten = CanWidenIV(FI, DT, LI, SE, AC, TTI, LAI); // It can happen that after widening of the IV, flattening may not be From 6b61d6e2dbd2257d659e844485b97e016f0ba12e Mon Sep 17 00:00:00 2001 From: nasmnc01 Date: Mon, 3 Nov 2025 12:59:46 +0000 Subject: [PATCH 3/5] Test typo Change-Id: I996ba838554494fe750fb0824844098d21ea8a52 --- llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll index 1de31d2c7c70d..bdaa889fdb0cd 100644 --- a/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll +++ b/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt %s -S -passes='loop(loop-flatten),verify' -verify-loop-info -verify-dom-info -verify-scev -o - | FileCheck %s -; RUN: opt %s -S -passes='loop(loop-flatten),verify' -loop-flatten-version-over-widen -verify-loop-info -verify-dom-info -verify-scev -o - | FileCheck %s --check-prefix=CHECK-VERSION-OVER-WIDEN +; RUN: opt %s -S -passes='loop(loop-flatten),verify' -loop-flatten-version-over-widen=true -verify-loop-info -verify-dom-info -verify-scev -o - | FileCheck %s --check-prefix=CHECK-VERSION-OVER-WIDEN target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" From 66c887e282a2b1703dbd82479da67f913be2d2f1 Mon Sep 17 00:00:00 2001 From: nasmnc01 Date: Mon, 10 Nov 2025 14:13:52 +0000 Subject: [PATCH 4/5] Review comments Make patch an NFC refactor Change-Id: I0155ae8e31ebf1e2f30cca89e201449a926dc192 --- llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index b0e044a61739b..5d98bcd4927f6 
100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -102,10 +102,6 @@ static cl::opt VersionLoops("loop-flatten-version-loops", cl::Hidden, cl::init(true), cl::desc("Version loops if flattened loop could overflow")); -static cl::opt VersionLoopsOverWiden( - "loop-flatten-version-over-widen", cl::Hidden, cl::init(false), - cl::desc("Version loops and generate runtime checks over widening the IV")); - namespace { // We require all uses of both induction variables to match this pattern: // @@ -877,9 +873,6 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, return false; } - if (VersionLoopsOverWiden && VersionLoop(FI, DT, LI, SE, LAI)) - return true; - LLVM_DEBUG(dbgs() << "Try widening the IVs\n"); Module *M = FI.InnerLoop->getHeader()->getParent()->getParent(); auto &DL = M->getDataLayout(); @@ -998,7 +991,8 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, return false; } LLVM_DEBUG(dbgs() << "Multiply might overflow, versioning loop\n"); - assert(VersionLoop(FI, DT, LI, SE, LAI) && "Failed to version loop"); + bool LoopIsVersioned = VersionLoop(FI, DT, LI, SE, LAI); + assert(LoopIsVersioned && "Failed to version loop"); } else { LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n"); } From e08aa7e0fb36205e1adf1e19d6577502d0ec3ced Mon Sep 17 00:00:00 2001 From: nasmnc01 Date: Tue, 11 Nov 2025 09:40:25 +0000 Subject: [PATCH 5/5] Update test Change-Id: Ibea6a2e2a18c080e20dced1a9d5ffd7f57b122e4 --- .../LoopFlatten/loop-flatten-version.ll | 44 ++++--------------- 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll index bdaa889fdb0cd..0842f049feb11 100644 --- a/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll +++ b/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions 
have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt %s -S -passes='loop(loop-flatten),verify' -verify-loop-info -verify-dom-info -verify-scev -o - | FileCheck %s -; RUN: opt %s -S -passes='loop(loop-flatten),verify' -loop-flatten-version-over-widen=true -verify-loop-info -verify-dom-info -verify-scev -o - | FileCheck %s --check-prefix=CHECK-VERSION-OVER-WIDEN +; RUN: opt %s -S -passes='loop(loop-flatten),verify' -loop-flatten-widen-iv=false -loop-flatten-version-loops=true -verify-loop-info -verify-dom-info -verify-scev -o - | FileCheck %s --check-prefix=CHECK-VERSION-OVER-WIDEN target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" @@ -185,53 +185,25 @@ define void @noinbounds_gep_too_large_mul(i64 %N, ptr %A) { ; CHECK-VERSION-OVER-WIDEN-SAME: i64 [[N:%.*]], ptr [[A:%.*]]) { ; CHECK-VERSION-OVER-WIDEN-NEXT: entry: ; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP3:%.*]] = icmp ult i64 0, [[N]] -; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP3]], label [[FOR_INNER_PREHEADER_LVER_CHECK:%.*]], label [[FOR_END:%.*]] -; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.lver.check: -; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[N]], i64 [[N]]) -; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_TRIPCOUNT:%.*]] = extractvalue { i64, i1 } [[FLATTEN_MUL]], 0 -; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[FLATTEN_MUL]], 1 -; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[FLATTEN_OVERFLOW]], label [[FOR_INNER_PREHEADER_PH_LVER_ORIG:%.*]], label [[FOR_INNER_PREHEADER_PH:%.*]] -; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.ph.lver.orig: -; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER_PREHEADER_LVER_ORIG:%.*]] -; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.lver.orig: -; CHECK-VERSION-OVER-WIDEN-NEXT: [[I_LVER_ORIG:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER_PH_LVER_ORIG]] ], [ [[INC2_LVER_ORIG:%.*]], [[FOR_OUTER_LVER_ORIG:%.*]] ] -; 
CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER_LVER_ORIG:%.*]] -; CHECK-VERSION-OVER-WIDEN: for.inner.lver.orig: -; CHECK-VERSION-OVER-WIDEN-NEXT: [[J_LVER_ORIG:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER_LVER_ORIG]] ], [ [[INC1_LVER_ORIG:%.*]], [[FOR_INNER_LVER_ORIG]] ] -; CHECK-VERSION-OVER-WIDEN-NEXT: [[MUL_LVER_ORIG:%.*]] = mul i64 [[I_LVER_ORIG]], [[N]] -; CHECK-VERSION-OVER-WIDEN-NEXT: [[GEP_LVER_ORIG:%.*]] = getelementptr i32, ptr [[A]], i64 [[MUL_LVER_ORIG]] -; CHECK-VERSION-OVER-WIDEN-NEXT: [[ARRAYIDX_LVER_ORIG:%.*]] = getelementptr i32, ptr [[GEP_LVER_ORIG]], i64 [[J_LVER_ORIG]] -; CHECK-VERSION-OVER-WIDEN-NEXT: store i32 0, ptr [[ARRAYIDX_LVER_ORIG]], align 4 -; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC1_LVER_ORIG]] = add nuw i64 [[J_LVER_ORIG]], 1 -; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP2_LVER_ORIG:%.*]] = icmp ult i64 [[INC1_LVER_ORIG]], [[N]] -; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP2_LVER_ORIG]], label [[FOR_INNER_LVER_ORIG]], label [[FOR_OUTER_LVER_ORIG]] -; CHECK-VERSION-OVER-WIDEN: for.outer.lver.orig: -; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC2_LVER_ORIG]] = add i64 [[I_LVER_ORIG]], 1 -; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP1_LVER_ORIG:%.*]] = icmp ult i64 [[INC2_LVER_ORIG]], [[N]] -; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP1_LVER_ORIG]], label [[FOR_INNER_PREHEADER_LVER_ORIG]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]] -; CHECK-VERSION-OVER-WIDEN: for.inner.preheader.ph: +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP3]], label [[FOR_INNER_PREHEADER_PH:%.*]], label [[FOR_END:%.*]] +; CHECK-VERSION-OVER-WIDEN: for.outer.preheader: ; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER_PREHEADER:%.*]] ; CHECK-VERSION-OVER-WIDEN: for.inner.preheader: ; CHECK-VERSION-OVER-WIDEN-NEXT: [[I:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER_PH]] ], [ [[INC2:%.*]], [[FOR_OUTER:%.*]] ] -; CHECK-VERSION-OVER-WIDEN-NEXT: [[FLATTEN_ARRAYIDX:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]] ; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_INNER:%.*]] ; 
CHECK-VERSION-OVER-WIDEN: for.inner: -; CHECK-VERSION-OVER-WIDEN-NEXT: [[J:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER]] ] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[J:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER]] ], [ [[INC1:%.*]], [[FOR_INNER]] ] ; CHECK-VERSION-OVER-WIDEN-NEXT: [[MUL:%.*]] = mul i64 [[I]], [[N]] ; CHECK-VERSION-OVER-WIDEN-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A]], i64 [[MUL]] ; CHECK-VERSION-OVER-WIDEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[GEP]], i64 [[J]] -; CHECK-VERSION-OVER-WIDEN-NEXT: store i32 0, ptr [[FLATTEN_ARRAYIDX]], align 4 -; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC1:%.*]] = add nuw i64 [[J]], 1 +; CHECK-VERSION-OVER-WIDEN-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC1]] = add nuw i64 [[J]], 1 ; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INC1]], [[N]] -; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_OUTER]] +; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP2]], label [[FOR_INNER]], label [[FOR_OUTER]] ; CHECK-VERSION-OVER-WIDEN: for.outer: ; CHECK-VERSION-OVER-WIDEN-NEXT: [[INC2]] = add i64 [[I]], 1 -; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP1:%.*]] = icmp ult i64 [[INC2]], [[FLATTEN_TRIPCOUNT]] +; CHECK-VERSION-OVER-WIDEN-NEXT: [[CMP1:%.*]] = icmp ult i64 [[INC2]], [[N]] ; CHECK-VERSION-OVER-WIDEN-NEXT: br i1 [[CMP1]], label [[FOR_INNER_PREHEADER]], label [[FOR_END_LOOPEXIT_LOOPEXIT1:%.*]] -; CHECK-VERSION-OVER-WIDEN: for.end.loopexit.loopexit: -; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] -; CHECK-VERSION-OVER-WIDEN: for.end.loopexit.loopexit1: -; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_END_LOOPEXIT]] ; CHECK-VERSION-OVER-WIDEN: for.end.loopexit: ; CHECK-VERSION-OVER-WIDEN-NEXT: br label [[FOR_END]] ; CHECK-VERSION-OVER-WIDEN: for.end: