From e72522a79fde564ed7badd7b833281c7f4f38197 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Fri, 7 Nov 2025 06:23:51 -0800 Subject: [PATCH 1/8] [LoopInterchange] Don't consider loops with BTC=0 Do not consider loops with a zero backedge taken count as candidates for interchange. This seems like a sensible thing to do to me, because it suggests the loop doesn't iterate (the backedge is never taken, so the body runs only once) and there is no point in interchanging. This avoids triggering an assert about phis and their uses. I have a feeling that this fix might be hiding the issue, but I haven't yet been able to trigger the assert with other test cases; every time the loops are rejected for other reasons. Since I think this is a self-contained improvement that avoids a lot of test failures, I propose to reject these loops while I investigate further if I can still trigger this in some way. (Partial) fix for #163954 --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 13 +++++++++++++ .../LoopInterchange/interchanged-loop-nest-4.ll | 2 +- .../LoopInterchange/lcssa-phi-outer-latch.ll | 2 +- .../LoopInterchange/pr43176-move-to-new-latch.ll | 2 +- llvm/test/Transforms/LoopInterchange/pr43326.ll | 2 +- llvm/test/Transforms/LoopInterchange/pr57148.ll | 2 +- .../reductions-across-inner-and-outer-loop.ll | 2 +- 7 files changed, 19 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 9aaf6a5aa4d6a..776560e678d8e 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -101,6 +101,12 @@ static cl::opt MaxLoopNestDepth( "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform")); +// This is mainly for testing purposes, and certain tests that rely on +// behaviour that is more difficult to trigger otherwise. 
+static cl::opt SkipLoopsWithZeroBTC( + "loop-interchange-skip-zero-btc", cl::init(true), cl::Hidden, + cl::desc("Do not consider loops with a backedge taken count of 0")); + // We prefer cache cost to vectorization by default. static cl::list Profitabilities( "loop-interchange-profitabilities", cl::ZeroOrMore, @@ -428,6 +434,13 @@ static bool isComputableLoopNest(ScalarEvolution *SE, LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n"); return false; } + // A loop with a backedge that isn't taken, e.g. an unconditional branch + // true, isn't really a loop and we don't want to consider it as a + // candidate. + if (ExitCountOuter && SkipLoopsWithZeroBTC && ExitCountOuter->isZero()) { + LLVM_DEBUG(dbgs() << "Single iteration loop\n"); + return false; + } if (L->getNumBackEdges() != 1) { LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); return false; diff --git a/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-4.ll b/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-4.ll index f61054409937c..bbd554c8e47fb 100644 --- a/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-4.ll +++ b/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-4.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt < %s -passes="loop(loop-interchange,loop-interchange)" -cache-line-size=8 -verify-dom-info -verify-loop-info \ +; RUN: opt < %s -passes="loop(loop-interchange,loop-interchange)" -cache-line-size=8 -verify-dom-info -verify-loop-info -loop-interchange-skip-zero-btc=false \ ; RUN: -debug-only=loop-interchange 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll b/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll index 92ce3288b4529..1a1780addb708 100644 --- a/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll +++ b/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have 
been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +; RUN: opt < %s -passes=loop-interchange -loop-interchange-skip-zero-btc=false -cache-line-size=64 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s ; This test is checking that blocks outer.body and outer.latch, where outer.body is the exit ; block of the inner loop and outer.latch the latch of the outer loop, correctly diff --git a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll index f02ee1a0ced19..8968350c51788 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S +; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -loop-interchange-skip-zero-btc=false -S ; RUN: FileCheck --input-file=%t %s @b = external dso_local global [5 x i32], align 16 diff --git a/llvm/test/Transforms/LoopInterchange/pr43326.ll b/llvm/test/Transforms/LoopInterchange/pr43326.ll index cc4f07c722dd9..f75d377755ab7 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43326.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43326.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ -; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 +; RUN: -loop-interchange-skip-zero-btc=false -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 ; RUN: FileCheck 
--input-file=%t --check-prefix=REMARKS %s @a = global i32 0 diff --git a/llvm/test/Transforms/LoopInterchange/pr57148.ll b/llvm/test/Transforms/LoopInterchange/pr57148.ll index 747dbbcb4a44e..06dbc97efa3aa 100644 --- a/llvm/test/Transforms/LoopInterchange/pr57148.ll +++ b/llvm/test/Transforms/LoopInterchange/pr57148.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-interchange -cache-line-size=4 -loop-interchange-threshold=-100 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +; RUN: opt < %s -passes=loop-interchange -loop-interchange-skip-zero-btc=false -cache-line-size=4 -loop-interchange-threshold=-100 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s ; Make sure the loops are in LCSSA form after loop interchange, ; and loop interchange does not hit assertion errors and crash. diff --git a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll index 51fda4cf1ebe1..af9e5f7e58bc4 100644 --- a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll +++ b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ -; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 | FileCheck %s +; RUN: -loop-interchange-skip-zero-btc=false -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 | FileCheck %s ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s From f51d9f643a8f8a5012c1691f4fb129df759707a3 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Sat, 8 Nov 2025 01:26:23 -0800 Subject: [PATCH 2/8] Forgot to add test-case --- .../Transforms/LoopInterchange/zero-btc.ll | 56 +++++++++++++++++++ 1 file
changed, 56 insertions(+) create mode 100644 llvm/test/Transforms/LoopInterchange/zero-btc.ll diff --git a/llvm/test/Transforms/LoopInterchange/zero-btc.ll b/llvm/test/Transforms/LoopInterchange/zero-btc.ll new file mode 100644 index 0000000000000..58d794e5def40 --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/zero-btc.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=loop-interchange -loop-interchange-profitabilities=ignore -cache-line-size=64 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" + +; Test case for issue: https://github.com/llvm/llvm-project/issues/163954 + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[DOTLCSSA:%.*]], %[[OUTER_LATCH:.*]] ] +; CHECK-NEXT: br label %[[INNER_HEADER:.*]] +; CHECK: [[INNER_HEADER]]: +; CHECK-NEXT: [[J:%.*]] = phi i64 [ 0, %[[OUTER_HEADER]] ], [ [[J_NEXT:%.*]], %[[INNER_LATCH:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i8 [ [[I]], %[[OUTER_HEADER]] ], [ [[TMP1:%.*]], %[[INNER_LATCH]] ] +; CHECK-NEXT: br label %[[INNER_BODY:.*]] +; CHECK: [[INNER_BODY]]: +; CHECK-NEXT: br i1 true, label %[[INNER_LATCH]], label %[[INNER_BODY]] +; CHECK: [[INNER_LATCH]]: +; CHECK-NEXT: [[TMP1]] = or i8 [[TMP0]], 0 +; CHECK-NEXT: [[J_NEXT]] = add i64 [[J]], 1 +; CHECK-NEXT: br i1 true, label %[[OUTER_LATCH]], label %[[INNER_HEADER]] +; CHECK: [[OUTER_LATCH]]: +; CHECK-NEXT: [[DOTLCSSA]] = phi i8 [ [[TMP1]], %[[INNER_LATCH]] ] +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[OUTER_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %outer.header + +outer.header: + %i = phi i8 [ 0, %entry ], [ %1, 
%outer.latch ] + br label %inner.header + +inner.header: + %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.latch ] + %0 = phi i8 [ %i, %outer.header ], [ %1, %inner.latch ] + br label %inner.body + +inner.body: + br i1 true, label %inner.latch, label %inner.body ; another (self) loop, but never taken + +inner.latch: + %1 = or i8 %0, 0 + %j.next = add i64 %j, 1 + br i1 true, label %outer.latch, label %inner.header + +outer.latch: + br i1 true, label %exit, label %outer.header + +exit: + ret void +} From e3abad5993e222c6f2f8bdc674478f76cced67bc Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Mon, 10 Nov 2025 03:44:40 -0800 Subject: [PATCH 3/8] Removed datalayout from test, improved debug message, and clarified comment. --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 16 +++++++++------- llvm/test/Transforms/LoopInterchange/zero-btc.ll | 2 -- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 776560e678d8e..353de3aacd4be 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -101,12 +101,6 @@ static cl::opt MaxLoopNestDepth( "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform")); -// This is mainly for testing purposes, and certain tests that rely on -// behaviour that is more difficult to trigger otherwise. -static cl::opt SkipLoopsWithZeroBTC( - "loop-interchange-skip-zero-btc", cl::init(true), cl::Hidden, - cl::desc("Do not consider loops with a backedge taken count of 0")); - // We prefer cache cost to vectorization by default. 
static cl::list Profitabilities( "loop-interchange-profitabilities", cl::ZeroOrMore, @@ -126,6 +120,13 @@ static cl::list Profitabilities( "Ignore profitability, force interchange (does not " "work with other options)"))); +// FIXME: this option exists mainly for a couple of tests that check some +// corner cases that is more difficult to trigger otherwise; these tests should +// be rewritten and this option removed if possible. +static cl::opt SkipLoopsWithZeroBTC( + "loop-interchange-skip-zero-btc", cl::init(true), cl::Hidden, + cl::desc("Do not consider loops with a backedge taken count of 0")); + #ifndef NDEBUG static bool noDuplicateRulesAndIgnore(ArrayRef Rules) { SmallSet Set; @@ -438,7 +439,8 @@ static bool isComputableLoopNest(ScalarEvolution *SE, // true, isn't really a loop and we don't want to consider it as a // candidate. if (ExitCountOuter && SkipLoopsWithZeroBTC && ExitCountOuter->isZero()) { - LLVM_DEBUG(dbgs() << "Single iteration loop\n"); + LLVM_DEBUG(dbgs() << "The loop back-edge isn't taken, rejecting single " + "iteration loop\n"); return false; } if (L->getNumBackEdges() != 1) { diff --git a/llvm/test/Transforms/LoopInterchange/zero-btc.ll b/llvm/test/Transforms/LoopInterchange/zero-btc.ll index 58d794e5def40..af2aa72adcb94 100644 --- a/llvm/test/Transforms/LoopInterchange/zero-btc.ll +++ b/llvm/test/Transforms/LoopInterchange/zero-btc.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=loop-interchange -loop-interchange-profitabilities=ignore -cache-line-size=64 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" - ; Test case for issue: https://github.com/llvm/llvm-project/issues/163954 define void @test() { From 5d9326ab1c456f970153427a0418358a07170d5c Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Sat, 15 
Nov 2025 01:00:08 -0800 Subject: [PATCH 4/8] TMP --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 9 +++++---- .../LoopInterchange/interchanged-loop-nest-4.ll | 2 +- .../Transforms/LoopInterchange/lcssa-phi-outer-latch.ll | 2 +- .../LoopInterchange/pr43176-move-to-new-latch.ll | 2 +- llvm/test/Transforms/LoopInterchange/pr43326.ll | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 353de3aacd4be..0b01f05f65866 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -123,9 +123,9 @@ static cl::list Profitabilities( // FIXME: this option exists mainly for a couple of tests that check some // corner cases that is more difficult to trigger otherwise; these tests should // be rewritten and this option removed if possible. -static cl::opt SkipLoopsWithZeroBTC( - "loop-interchange-skip-zero-btc", cl::init(true), cl::Hidden, - cl::desc("Do not consider loops with a backedge taken count of 0")); +//static cl::opt SkipLoopsWithZeroBTC( +// "loop-interchange-skip-zero-btc", cl::init(true), cl::Hidden, +// cl::desc("Do not consider loops with a backedge taken count of 0")); #ifndef NDEBUG static bool noDuplicateRulesAndIgnore(ArrayRef Rules) { @@ -438,9 +438,10 @@ static bool isComputableLoopNest(ScalarEvolution *SE, // A loop with a backedge that isn't taken, e.g. an unconditional branch // true, isn't really a loop and we don't want to consider it as a // candidate. 
- if (ExitCountOuter && SkipLoopsWithZeroBTC && ExitCountOuter->isZero()) { + if (ExitCountOuter && /*SkipLoopsWithZeroBTC && */ExitCountOuter->isZero()) { LLVM_DEBUG(dbgs() << "The loop back-edge isn't taken, rejecting single " "iteration loop\n"); + LLVM_DEBUG(L->dump()); return false; } if (L->getNumBackEdges() != 1) { diff --git a/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-4.ll b/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-4.ll index bbd554c8e47fb..f61054409937c 100644 --- a/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-4.ll +++ b/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-4.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt < %s -passes="loop(loop-interchange,loop-interchange)" -cache-line-size=8 -verify-dom-info -verify-loop-info -loop-interchange-skip-zero-btc=false \ +; RUN: opt < %s -passes="loop(loop-interchange,loop-interchange)" -cache-line-size=8 -verify-dom-info -verify-loop-info \ ; RUN: -debug-only=loop-interchange 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll b/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll index 1a1780addb708..92ce3288b4529 100644 --- a/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll +++ b/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt < %s -passes=loop-interchange -loop-interchange-skip-zero-btc=false -cache-line-size=64 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s ; This test is checking that blocks outer.body and outer.latch, where outer.body is the exit ; block of the inner loop and outer.latch the latch of 
the outer loop, correctly diff --git a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll index 8968350c51788..f02ee1a0ced19 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -loop-interchange-skip-zero-btc=false -S +; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S ; RUN: FileCheck --input-file=%t %s @b = external dso_local global [5 x i32], align 16 diff --git a/llvm/test/Transforms/LoopInterchange/pr43326.ll b/llvm/test/Transforms/LoopInterchange/pr43326.ll index f75d377755ab7..cc4f07c722dd9 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43326.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43326.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ -; RUN: -loop-interchange-skip-zero-btc=false -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 +; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s @a = global i32 0 From 56c3e9b1fc069e94ce7b7e57eb5f423e62f066a1 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Mon, 17 Nov 2025 09:36:00 -0800 Subject: [PATCH 5/8] Don't reject the whole loopnest, fixed up and added test case. 
--- .../lib/Transforms/Scalar/LoopInterchange.cpp | 60 +++++++++---- .../loopnest-with-outer-btc0.ll | 73 ++++++++++++++++ .../Transforms/LoopInterchange/pr43326.ll | 6 +- .../Transforms/LoopInterchange/pr57148.ll | 84 +++++++++++-------- .../reductions-across-inner-and-outer-loop.ll | 2 +- 5 files changed, 168 insertions(+), 57 deletions(-) create mode 100644 llvm/test/Transforms/LoopInterchange/loopnest-with-outer-btc0.ll diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 0b01f05f65866..9b93339e1bedc 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -46,6 +46,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include +#include #include #include @@ -428,22 +429,15 @@ static bool hasSupportedLoopDepth(ArrayRef LoopList, } static bool isComputableLoopNest(ScalarEvolution *SE, - ArrayRef LoopList) { + ArrayRef LoopList, + std::map &LoopBTC) { for (Loop *L : LoopList) { const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); + LoopBTC[L] = ExitCountOuter; if (isa(ExitCountOuter)) { LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n"); return false; } - // A loop with a backedge that isn't taken, e.g. an unconditional branch - // true, isn't really a loop and we don't want to consider it as a - // candidate. - if (ExitCountOuter && /*SkipLoopsWithZeroBTC && */ExitCountOuter->isZero()) { - LLVM_DEBUG(dbgs() << "The loop back-edge isn't taken, rejecting single " - "iteration loop\n"); - LLVM_DEBUG(L->dump()); - return false; - } if (L->getNumBackEdges() != 1) { LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); return false; @@ -561,7 +555,8 @@ class LoopInterchangeProfitability { /// Check if the loop interchange is profitable. 
bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, - CharMatrix &DepMatrix, CacheCostManager &CCM); + CharMatrix &DepMatrix, CacheCostManager &CCM, + std::map &LoopBTC); private: int getInstrOrderCost(); @@ -618,14 +613,17 @@ struct LoopInterchange { DependenceInfo *DI = nullptr; DominatorTree *DT = nullptr; LoopStandardAnalysisResults *AR = nullptr; - /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; + // A cache to avoid recalculating the backedge-taken count for a loop. + std::map LoopBTC; LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, DominatorTree *DT, LoopStandardAnalysisResults *AR, - OptimizationRemarkEmitter *ORE) - : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {} + OptimizationRemarkEmitter *ORE, + std::map &&LoopBTC) + : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE), + LoopBTC(std::move(LoopBTC)) {} bool run(Loop *L) { if (L->getParentLoop()) @@ -717,7 +715,7 @@ struct LoopInterchange { LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, - DependencyMatrix, CCM)) { + DependencyMatrix, CCM, LoopBTC)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -1477,7 +1475,30 @@ std::optional LoopInterchangeProfitability::isProfitableForVectorization( bool LoopInterchangeProfitability::isProfitable( const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, - unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) { + unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM, + std::map &LoopBTC) { + + auto *InnerBTC = LoopBTC[InnerLoop]; + auto *OuterBTC = LoopBTC[OuterLoop]; + assert(InnerBTC && OuterBTC && + "Loop BTC should exist in cache but not found"); + // A loop with a backedge that isn't taken, e.g. 
an unconditional branch + // true, isn't really a loop and we don't want to consider it as a + // candidate. + // TODO: when interchange is forced, we should probably also allow + // interchange for these loops, and thus this logic should be moved just + // below the cost-model ignore check below. But this check is done first + // to avoid the issue in #163954. + if (InnerBTC && InnerBTC->isZero()) { + LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting " + "single iteration loop\n"); + return false; + } + if (OuterBTC && OuterBTC->isZero()) { + LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting " + "single iteration loop\n"); + return false; + } // Return true if interchange is forced and the cost-model ignored. if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore) @@ -2114,6 +2135,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, LPMUpdater &U) { Function &F = *LN.getParent(); SmallVector LoopList(LN.getLoops()); + std::map LoopBTC; if (MaxMemInstrCount < 1) { LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1"); @@ -2125,7 +2147,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, if (!hasSupportedLoopDepth(LoopList, ORE)) return PreservedAnalyses::all(); // Ensure computable loop nest. 
- if (!isComputableLoopNest(&AR.SE, LoopList)) { + if (!isComputableLoopNest(&AR.SE, LoopList, LoopBTC)) { LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); return PreservedAnalyses::all(); } @@ -2138,7 +2160,9 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, }); DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); - if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN)) + if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE, + std::move(LoopBTC)) + .run(LN)) return PreservedAnalyses::all(); U.markLoopNestChanged(true); return getLoopPassPreservedAnalyses(); diff --git a/llvm/test/Transforms/LoopInterchange/loopnest-with-outer-btc0.ll b/llvm/test/Transforms/LoopInterchange/loopnest-with-outer-btc0.ll new file mode 100644 index 0000000000000..ac3bbcf0511ef --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/loopnest-with-outer-btc0.ll @@ -0,0 +1,73 @@ +; RUN: opt < %s -passes=loop-interchange -verify-dom-info -verify-loop-info \ +; RUN: -pass-remarks-output=%t -pass-remarks='loop-interchange' -S +; RUN: cat %t | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@D = common global [100 x [100 x [100 x i32]]] zeroinitializer + +; Test for interchange in +; +; for(int i=0;i<1;i++) +; for(int j=0;j<100;j++) +; for(int k=0;k<100;k++) +; D[i][k][j] = D[i][k][j]+t; +; + +; CHECK: --- !Analysis +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Dependence +; CHECK-NEXT: Function: interchange_i_and_j +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Computed dependence info, invoking the transform. +; CHECK-NEXT: ... +; CHECK-NEXT: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: interchange_i_and_j +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: ... 
+; CHECK-NEXT: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: InterchangeNotProfitable +; CHECK-NEXT: Function: interchange_i_and_j +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Insufficient information to calculate the cost of loop for interchange. +; CHECK-NEXT: ... + +define void @interchange_i_and_j(i32 %t){ +entry: + br label %outer.header + +outer.header: + %i = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ] + br label %inner1.header + +inner1.header: + %j = phi i64 [ 0, %outer.header ], [ %inc13, %for.inc12 ] + br label %inner2.body + +inner2.body: + %k = phi i64 [ 0, %inner1.header ], [ %inc, %inner2.body ] + %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], ptr @D, i64 0, i64 %i, i64 %k, i64 %j + %0 = load i32, ptr %arrayidx8 + %add = add nsw i32 %0, %t + store i32 %add, ptr %arrayidx8 + %inc = add nuw nsw i64 %k, 1 + %exitcond = icmp eq i64 %inc, 100 + br i1 %exitcond, label %for.inc12, label %inner2.body + +for.inc12: + %inc13 = add nuw nsw i64 %j, 1 + %exitcond29 = icmp eq i64 %inc13, 100 + br i1 %exitcond29, label %for.inc15, label %inner1.header + +for.inc15: + %inc16 = add nuw nsw i64 %i, 1 + %exitcond30 = icmp eq i64 %inc16, 1 + br i1 %exitcond30, label %for.end17, label %outer.header + +for.end17: + ret void +} diff --git a/llvm/test/Transforms/LoopInterchange/pr43326.ll b/llvm/test/Transforms/LoopInterchange/pr43326.ll index cc4f07c722dd9..666f11d4969a0 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43326.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43326.ll @@ -64,7 +64,7 @@ for.end: ; preds = %for.inc for.inc10: ; preds = %for.end %j.next = add i8 %j, -1 - %cmp = icmp sgt i8 %j.next, -1 + %cmp = icmp sgt i8 %j.next, -10 br i1 %cmp, label %inner1.header, label %for.end11 for.end11: ; preds = %for.inc10 @@ -75,8 +75,8 @@ for.end11: ; preds = %for.inc10 for.inc12: ; preds = %for.end11 %inc13 = add nsw i32 %inc1312, 1 - %tobool.not = icmp eq i32 %inc13, 0 - br i1 %tobool.not, label 
%for.cond.for.end14_crit_edge, label %outer.header + %tobool.not = icmp slt i32 %inc13, 42 + br i1 %tobool.not, label %outer.header, label %for.cond.for.end14_crit_edge for.cond.for.end14_crit_edge: ; preds = %for.inc12 %inc13.lcssa = phi i32 [ %inc13, %for.inc12 ] diff --git a/llvm/test/Transforms/LoopInterchange/pr57148.ll b/llvm/test/Transforms/LoopInterchange/pr57148.ll index 06dbc97efa3aa..b5b1c0e265013 100644 --- a/llvm/test/Transforms/LoopInterchange/pr57148.ll +++ b/llvm/test/Transforms/LoopInterchange/pr57148.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-interchange -loop-interchange-skip-zero-btc=false -cache-line-size=4 -loop-interchange-threshold=-100 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +; RUN: opt < %s -passes=loop-interchange -cache-line-size=4 -loop-interchange-threshold=-100 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s ; Make sure the loops are in LCSSA form after loop interchange, ; and loop interchange does not hit assertion errors and crash. 
@@ -35,16 +35,19 @@ define void @test1() { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX55]], align 1 ; CHECK-NEXT: [[ADD61:%.*]] = add i32 undef, undef ; CHECK-NEXT: [[INC63:%.*]] = add nsw i16 [[K_09]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[K_09]], 42 ; CHECK-NEXT: br label [[FOR_END67]] ; CHECK: for.body42.split: ; CHECK-NEXT: [[ADD61_LCSSA:%.*]] = phi i32 [ [[ADD61]], [[FOR_END67]] ] ; CHECK-NEXT: [[TMP1]] = add nsw i16 [[K_09]], 1 -; CHECK-NEXT: br i1 true, label [[FOR_END64]], label [[FOR_BODY42]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i16 [[K_09]], 42 +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY42]], label [[FOR_END64]] ; CHECK: for.end64: ; CHECK-NEXT: [[ADD61_LCSSA_LCSSA:%.*]] = phi i32 [ [[ADD61_LCSSA]], [[FOR_BODY42_SPLIT]] ] ; CHECK-NEXT: store i32 [[ADD61_LCSSA_LCSSA]], ptr undef, align 1 ; CHECK-NEXT: [[INC66]] = add nuw nsw i16 [[J_010]], 1 -; CHECK-NEXT: br i1 true, label [[FOR_COND75_PREHEADER:%.*]], label [[FOR_COND37_PREHEADER]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i16 [[J_010]], 43 +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND37_PREHEADER]], label [[FOR_COND75_PREHEADER:%.*]] ; CHECK: for.end67: ; CHECK-NEXT: [[INC69]] = add nuw nsw i16 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND13_NOT:%.*]] = icmp eq i16 [[INC69]], 2 @@ -72,12 +75,14 @@ for.body42: ; preds = %for.body42, %for.co %0 = load i32, ptr %arrayidx55, align 1 %add61 = add i32 undef, undef %inc63 = add nsw i16 %k.09, 1 - br i1 true, label %for.end64, label %for.body42 + %cmp = icmp slt i16 %k.09, 42 + br i1 %cmp, label %for.body42, label %for.end64 for.end64: ; preds = %for.body42 store i32 %add61, ptr undef, align 1 %inc66 = add nuw nsw i16 %j.010, 1 - br i1 true, label %for.end67, label %for.cond37.preheader + %cmp2 = icmp slt i16 %j.010, 43 + br i1 %cmp2, label %for.cond37.preheader, label %for.end67 for.end67: ; preds = %for.end64 %inc69 = add nuw nsw i16 %i.011, 1 @@ -88,7 +93,6 @@ for.cond75: ; preds = %for.cond75, %for.en br label %for.cond75 } - ; Make sure that 
we split the phi nodes in the middle loop header ; into a separate basic block to avoid the situation where use of ; the outermost indvar appears before its def after interchanging @@ -98,40 +102,42 @@ for.cond75: ; preds = %for.cond75, %for.en define void @test2() { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND37_PREHEADER_PREHEADER:%.*]] -; CHECK: for.cond33.preheader.preheader: ; CHECK-NEXT: br label [[FOR_COND33_PREHEADER:%.*]] +; CHECK: for.cond33.preheader.preheader: +; CHECK-NEXT: br label [[FOR_COND33_PREHEADER1:%.*]] ; CHECK: for.cond33.preheader: ; CHECK-NEXT: [[I_166:%.*]] = phi i16 [ [[INC69:%.*]], [[FOR_INC68:%.*]] ], [ 0, [[FOR_COND33_PREHEADER_PREHEADER:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds [2 x [4 x i32]], ptr @c, i16 0, i16 [[I_166]], i16 [[J_165:%.*]] -; CHECK-NEXT: br label [[VECTOR_BODY85_SPLIT1:%.*]] -; CHECK: for.cond37.preheader.preheader: ; CHECK-NEXT: br label [[FOR_COND37_PREHEADER:%.*]] +; CHECK: for.cond37.preheader.preheader: +; CHECK-NEXT: br label [[FOR_COND37_PREHEADER1:%.*]] ; CHECK: for.cond37.preheader: -; CHECK-NEXT: [[J_165]] = phi i16 [ [[INC66:%.*]], [[MIDDLE_BLOCK80:%.*]] ], [ 0, [[FOR_COND37_PREHEADER_PREHEADER]] ] -; CHECK-NEXT: br label [[FOR_COND37_PREHEADER_SPLIT:%.*]] -; CHECK: for.cond37.preheader.split: +; CHECK-NEXT: [[J_165]] = phi i16 [ [[INC66:%.*]], [[MIDDLE_BLOCK80:%.*]] ], [ 0, [[FOR_COND33_PREHEADER]] ] ; CHECK-NEXT: br label [[VECTOR_BODY85:%.*]] +; CHECK: for.cond37.preheader.split: +; CHECK-NEXT: br label [[VECTOR_BODY86:%.*]] ; CHECK: vector.body85: -; CHECK-NEXT: [[INDEX86:%.*]] = phi i16 [ 0, [[FOR_COND37_PREHEADER_SPLIT]] ], [ [[TMP3:%.*]], [[VECTOR_BODY85_SPLIT:%.*]] ] +; CHECK-NEXT: [[INDEX86:%.*]] = phi i16 [ 0, [[VECTOR_BODY85]] ], [ [[TMP5:%.*]], [[VECTOR_BODY85_SPLIT:%.*]] ] ; CHECK-NEXT: br label [[FOR_COND33_PREHEADER_PREHEADER]] ; CHECK: vector.body85.split1: ; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i16 [[INDEX86]], 2 ; CHECK-NEXT: 
[[TMP1:%.*]] = getelementptr inbounds [512 x [4 x i32]], ptr @b, i16 0, i16 [[TMP0]], i16 [[J_165]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[INDEX_NEXT87:%.*]] = add nuw i16 [[INDEX86]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw i16 [[INDEX86]], 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i16 [[INDEX86]], 42 ; CHECK-NEXT: br label [[FOR_INC68]] ; CHECK: vector.body85.split: -; CHECK-NEXT: [[TMP3]] = add nuw i16 [[INDEX86]], 4 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK80]], label [[VECTOR_BODY85]] +; CHECK-NEXT: [[TMP5]] = add nuw i16 [[INDEX86]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i16 [[INDEX86]], 42 +; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_BODY86]], label [[MIDDLE_BLOCK80]] ; CHECK: middle.block80: ; CHECK-NEXT: [[INC66]] = add nuw nsw i16 [[J_165]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[INC66]], 42 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND75_PREHEADER:%.*]], label [[FOR_COND37_PREHEADER]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND37_PREHEADER1]], label [[FOR_COND75_PREHEADER:%.*]] ; CHECK: for.inc68: ; CHECK-NEXT: [[INC69]] = add nuw nsw i16 [[I_166]], 1 -; CHECK-NEXT: [[EXITCOND77_NOT:%.*]] = icmp eq i16 [[INC69]], 2 -; CHECK-NEXT: br i1 [[EXITCOND77_NOT]], label [[VECTOR_BODY85_SPLIT]], label [[FOR_COND33_PREHEADER]] +; CHECK-NEXT: [[EXITCOND77_NOT:%.*]] = icmp slt i16 [[INC69]], 24 +; CHECK-NEXT: br i1 [[EXITCOND77_NOT]], label [[FOR_COND33_PREHEADER1]], label [[VECTOR_BODY85_SPLIT]] ; CHECK: for.cond75.preheader: ; CHECK-NEXT: unreachable ; @@ -153,17 +159,18 @@ vector.body85: ; preds = %vector.body85, %for %1 = getelementptr inbounds [512 x [4 x i32]], ptr @b, i16 0, i16 %0, i16 %j.165 %2 = load i32, ptr %1, align 1 %index.next87 = add nuw i16 %index86, 4 - br i1 true, label %middle.block80, label %vector.body85 + %cmp2 = icmp slt i16 %index86, 42 + br i1 %cmp2, label %vector.body85, label %middle.block80 middle.block80: ; preds = %vector.body85 %inc66 = add nuw nsw i16 %j.165, 1 %cmp = icmp slt 
i16 %inc66, 42 - br i1 %cmp, label %for.inc68, label %for.cond37.preheader + br i1 %cmp, label %for.cond37.preheader, label %for.inc68 for.inc68: ; preds = %middle.block80 %inc69 = add nuw nsw i16 %i.166, 1 - %exitcond77.not = icmp eq i16 %inc69, 2 - br i1 %exitcond77.not, label %for.cond75.preheader, label %for.cond33.preheader + %exitcond77.not = icmp slt i16 %inc69, 24 + br i1 %exitcond77.not, label %for.cond33.preheader, label %for.cond75.preheader for.cond75.preheader: ; preds = %for.inc68 unreachable @@ -178,11 +185,11 @@ define void @test3() { ; CHECK-NEXT: br label [[FOR_COND33_PREHEADER:%.*]] ; CHECK: for.cond33.preheader: ; CHECK-NEXT: [[I_011:%.*]] = phi i16 [ [[INC69:%.*]], [[FOR_END67:%.*]] ], [ 0, [[FOR_COND33_PREHEADER_PREHEADER:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY42_SPLIT1:%.*]] +; CHECK-NEXT: br label [[FOR_COND38_PREHEADER:%.*]] ; CHECK: for.body42.preheader: ; CHECK-NEXT: br label [[FOR_BODY42:%.*]] ; CHECK: for.cond38.preheader.preheader: -; CHECK-NEXT: br label [[FOR_COND38_PREHEADER:%.*]] +; CHECK-NEXT: br label [[FOR_COND38_PREHEADER1:%.*]] ; CHECK: for.cond37.preheader.preheader: ; CHECK-NEXT: br label [[FOR_COND37_PREHEADER:%.*]] ; CHECK: for.cond37.preheader: @@ -192,27 +199,31 @@ define void @test3() { ; CHECK-NEXT: [[K_010:%.*]] = phi i16 [ [[INC67:%.*]], [[FOR_END65:%.*]] ], [ 0, [[FOR_COND38_PREHEADER_PREHEADER]] ] ; CHECK-NEXT: br label [[FOR_BODY42_PREHEADER:%.*]] ; CHECK: for.body42: -; CHECK-NEXT: [[K_09:%.*]] = phi i16 [ [[TMP1:%.*]], [[FOR_BODY42_SPLIT:%.*]] ], [ -512, [[FOR_BODY42_PREHEADER]] ] +; CHECK-NEXT: [[K_09:%.*]] = phi i16 [ [[TMP3:%.*]], [[FOR_BODY42_SPLIT:%.*]] ], [ -512, [[FOR_BODY42_PREHEADER]] ] ; CHECK-NEXT: br label [[FOR_COND33_PREHEADER_PREHEADER]] ; CHECK: for.body42.split1: ; CHECK-NEXT: [[SUB51:%.*]] = add nsw i16 [[K_09]], 512 ; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds [1024 x [512 x [4 x i32]]], ptr @d, i16 0, i16 [[SUB51]], i16 [[J_010]], i16 [[K_010]] ; CHECK-NEXT: [[TMP0:%.*]] = 
load i32, ptr [[ARRAYIDX55]], align 1 ; CHECK-NEXT: [[ADD61:%.*]] = add i32 undef, undef -; CHECK-NEXT: [[INC63:%.*]] = add nsw i16 [[K_09]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i16 [[K_09]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[K_09]], 42 ; CHECK-NEXT: br label [[FOR_END67]] ; CHECK: for.body42.split: ; CHECK-NEXT: [[ADD61_LCSSA:%.*]] = phi i32 [ [[ADD61]], [[FOR_END67]] ] -; CHECK-NEXT: [[TMP1]] = add nsw i16 [[K_09]], 1 -; CHECK-NEXT: br i1 true, label [[FOR_END65]], label [[FOR_BODY42]] +; CHECK-NEXT: [[TMP3]] = add nsw i16 [[K_09]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i16 [[K_09]], 42 +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY42]], label [[FOR_END65]] ; CHECK: for.end65: ; CHECK-NEXT: [[ADD61_LCSSA_LCSSA:%.*]] = phi i32 [ [[ADD61_LCSSA]], [[FOR_BODY42_SPLIT]] ] ; CHECK-NEXT: store i32 [[ADD61_LCSSA_LCSSA]], ptr undef, align 1 ; CHECK-NEXT: [[INC67]] = add nuw nsw i16 [[K_010]], 1 -; CHECK-NEXT: br i1 true, label [[FOR_END64]], label [[FOR_COND38_PREHEADER]] +; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i16 [[K_010]], 44 +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_COND38_PREHEADER1]], label [[FOR_END64]] ; CHECK: for.end64: ; CHECK-NEXT: [[INC66]] = add nuw nsw i16 [[J_010]], 1 -; CHECK-NEXT: br i1 true, label [[FOR_COND75_PREHEADER:%.*]], label [[FOR_COND37_PREHEADER]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i16 [[J_010]], 43 +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND37_PREHEADER]], label [[FOR_COND75_PREHEADER:%.*]] ; CHECK: for.end67: ; CHECK-NEXT: [[INC69]] = add nuw nsw i16 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND13_NOT:%.*]] = icmp eq i16 [[INC69]], 2 @@ -244,16 +255,19 @@ for.body42: ; preds = %for.body42, %for.co %0 = load i32, ptr %arrayidx55, align 1 %add61 = add i32 undef, undef %inc63 = add nsw i16 %k.09, 1 - br i1 true, label %for.end65, label %for.body42 + %cmp = icmp slt i16 %k.09, 42 + br i1 %cmp, label %for.body42, label %for.end65 for.end65: ; preds = %for.body42 store i32 %add61, ptr undef, align 1 %inc67 = add nuw nsw i16 
%k.010, 1 - br i1 true, label %for.end64, label %for.cond38.preheader + %cmp3 = icmp slt i16 %k.010, 44 + br i1 %cmp3, label %for.cond38.preheader, label %for.end64 for.end64: ; preds = %for.end65 %inc66 = add nuw nsw i16 %j.010, 1 - br i1 true, label %for.end67, label %for.cond37.preheader + %cmp2 = icmp slt i16 %j.010, 43 + br i1 %cmp2, label %for.cond37.preheader, label %for.end67 for.end67: ; preds = %for.end64 %inc69 = add nuw nsw i16 %i.011, 1 diff --git a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll index af9e5f7e58bc4..51fda4cf1ebe1 100644 --- a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll +++ b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ -; RUN: -loop-interchange-skip-zero-btc=false -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 | FileCheck %s +; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 | FileCheck %s ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s From 7d68d9cf87ae32e6c2220212f78651c8366f3724 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Mon, 17 Nov 2025 09:57:41 -0800 Subject: [PATCH 6/8] Remove commented out option --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 9b93339e1bedc..73df9b0f823a6 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -121,13 +121,6 @@ static cl::list Profitabilities( "Ignore profitability, force interchange (does not " "work with other options)"))); -// FIXME: this option exists mainly for a couple of tests that check 
some -// corner cases that is more difficult to trigger otherwise; these tests should -// be rewritten and this option removed if possible. -//static cl::opt SkipLoopsWithZeroBTC( -// "loop-interchange-skip-zero-btc", cl::init(true), cl::Hidden, -// cl::desc("Do not consider loops with a backedge taken count of 0")); - #ifndef NDEBUG static bool noDuplicateRulesAndIgnore(ArrayRef Rules) { SmallSet Set; From 9d548a55903c2dbd28e2c48bff750a99fb7b5ca3 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Tue, 18 Nov 2025 06:36:02 -0800 Subject: [PATCH 7/8] Removed BTC caching, fixed up test case. --- .../lib/Transforms/Scalar/LoopInterchange.cpp | 35 ++++++------------- .../loopnest-with-outer-btc0.ll | 5 +-- 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 73df9b0f823a6..6a5fe1605d71f 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -422,11 +422,9 @@ static bool hasSupportedLoopDepth(ArrayRef LoopList, } static bool isComputableLoopNest(ScalarEvolution *SE, - ArrayRef LoopList, - std::map &LoopBTC) { + ArrayRef LoopList) { for (Loop *L : LoopList) { const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); - LoopBTC[L] = ExitCountOuter; if (isa(ExitCountOuter)) { LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n"); return false; @@ -548,8 +546,7 @@ class LoopInterchangeProfitability { /// Check if the loop interchange is profitable. bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, - CharMatrix &DepMatrix, CacheCostManager &CCM, - std::map &LoopBTC); + CharMatrix &DepMatrix, CacheCostManager &CCM); private: int getInstrOrderCost(); @@ -606,17 +603,14 @@ struct LoopInterchange { DependenceInfo *DI = nullptr; DominatorTree *DT = nullptr; LoopStandardAnalysisResults *AR = nullptr; + /// Interface to emit optimization remarks. 
OptimizationRemarkEmitter *ORE; - // A cache to avoid recalculating the backedge-taken count for a loop. - std::map LoopBTC; LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, DominatorTree *DT, LoopStandardAnalysisResults *AR, - OptimizationRemarkEmitter *ORE, - std::map &&LoopBTC) - : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE), - LoopBTC(std::move(LoopBTC)) {} + OptimizationRemarkEmitter *ORE) + : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {} bool run(Loop *L) { if (L->getParentLoop()) @@ -708,7 +702,7 @@ struct LoopInterchange { LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, - DependencyMatrix, CCM, LoopBTC)) { + DependencyMatrix, CCM)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -1468,13 +1462,7 @@ std::optional LoopInterchangeProfitability::isProfitableForVectorization( bool LoopInterchangeProfitability::isProfitable( const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, - unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM, - std::map &LoopBTC) { - - auto *InnerBTC = LoopBTC[InnerLoop]; - auto *OuterBTC = LoopBTC[OuterLoop]; - assert(InnerBTC && OuterBTC && - "Loop BTC should exist in cache but not found"); + unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) { // A loop with a backedge that isn't taken, e.g. an unconditional branch // true, isn't really a loop and we don't want to consider it as a // candidate. @@ -1482,6 +1470,8 @@ bool LoopInterchangeProfitability::isProfitable( // interchange for these loops, and thus this logic should be moved just // below the cost-model ignore check below. But this check is done first // to avoid the issue in #163954. 
+ const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop); + const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop); if (InnerBTC && InnerBTC->isZero()) { LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting " "single iteration loop\n"); @@ -2128,7 +2118,6 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, LPMUpdater &U) { Function &F = *LN.getParent(); SmallVector LoopList(LN.getLoops()); - std::map LoopBTC; if (MaxMemInstrCount < 1) { LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1"); @@ -2140,7 +2129,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, if (!hasSupportedLoopDepth(LoopList, ORE)) return PreservedAnalyses::all(); // Ensure computable loop nest. - if (!isComputableLoopNest(&AR.SE, LoopList, LoopBTC)) { + if (!isComputableLoopNest(&AR.SE, LoopList)) { LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); return PreservedAnalyses::all(); } @@ -2153,9 +2142,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, }); DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); - if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE, - std::move(LoopBTC)) - .run(LN)) + if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN)) return PreservedAnalyses::all(); U.markLoopNestChanged(true); return getLoopPassPreservedAnalyses(); diff --git a/llvm/test/Transforms/LoopInterchange/loopnest-with-outer-btc0.ll b/llvm/test/Transforms/LoopInterchange/loopnest-with-outer-btc0.ll index ac3bbcf0511ef..613dcd4db0afc 100644 --- a/llvm/test/Transforms/LoopInterchange/loopnest-with-outer-btc0.ll +++ b/llvm/test/Transforms/LoopInterchange/loopnest-with-outer-btc0.ll @@ -6,7 +6,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @D = common global [100 x [100 x [100 x i32]]] zeroinitializer -; Test for interchange in +; The outer loop's backedge isn't taken. Check the loop with BTC=0 is considered +; unprofitable, but that we still interchange the two inner loops. 
; ; for(int i=0;i<1;i++) ; for(int j=0;j<100;j++) @@ -50,7 +51,7 @@ inner1.header: inner2.body: %k = phi i64 [ 0, %inner1.header ], [ %inc, %inner2.body ] - %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], ptr @D, i64 0, i64 %i, i64 %k, i64 %j + %arrayidx8 = getelementptr inbounds [100 x [100 x i32]], ptr @D, i64 %i, i64 %k, i64 %j %0 = load i32, ptr %arrayidx8 %add = add nsw i32 %0, %t store i32 %add, ptr %arrayidx8 From 738e6b6ddbba86d863746a30df8419dcd71e383f Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Wed, 19 Nov 2025 03:25:53 -0800 Subject: [PATCH 8/8] Addressed last comments --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 6a5fe1605d71f..330b4abb9942f 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -46,7 +46,6 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include -#include #include #include @@ -1463,9 +1462,8 @@ std::optional LoopInterchangeProfitability::isProfitableForVectorization( bool LoopInterchangeProfitability::isProfitable( const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) { - // A loop with a backedge that isn't taken, e.g. an unconditional branch - // true, isn't really a loop and we don't want to consider it as a - // candidate. + // Do not consider loops with a backedge that isn't taken, e.g. an + // unconditional branch true/false, as candidates for interchange. // TODO: when interchange is forced, we should probably also allow // interchange for these loops, and thus this logic should be moved just // below the cost-model ignore check below. But this check is done first