[LoopFlatten] Move it from LPM2 to LPM1
In D110057 we moved LoopFlatten to a LoopPassManager. This caused a performance
regression for our 64-bit targets (the 32-bit targets were unaffected): the pass
no longer triggers for a motivating example. The reason is that the IR it now
sees is very different from what it expects; LoopFlatten tries to match loop
statements and particular uses of induction variables. The easiest fix is to
move LoopFlatten to a place in the pipeline where the IR still has the expected
shape, which is just before IndVarSimplify. This means moving it from LPM2 to
LPM1, so it actually runs slightly earlier than before. IndVarSimplify performs
significant rewrites that are difficult to "look through" in LoopFlatten.

Differential Revision: https://reviews.llvm.org/D116612
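For context, the shape LoopFlatten matches is a perfect loop nest stepping linearly through a buffer. A rough C++ equivalent of the new phase-ordering test below, reconstructed from its mangled symbols (_Z3fooPiii demangles to foo(int*, int, int), _Z1fi to f(int)) rather than taken from the original source:

void f(int); // opaque call in the inner loop body

// A flattening candidate: the induction variables i and j are only used in
// the bounds checks and in the linear index i * M + j, so the nest can be
// rewritten as a single loop of N * M iterations.
void foo(int *A, int N, int M) {
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < M; ++j)
      f(A[i * M + j]);
}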
Sjoerd Meijer committed Jan 19, 2022
1 parent 016022e commit f269ec2
Showing 2 changed files with 71 additions and 6 deletions.
13 changes: 7 additions & 6 deletions llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -300,6 +300,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
   // TODO: Investigate promotion cap for O1.
   LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
   LPM1.addPass(SimpleLoopUnswitchPass());
+  if (EnableLoopFlatten)
+    LPM1.addPass(LoopFlattenPass());
 
   LPM2.addPass(LoopIdiomRecognizePass());
   LPM2.addPass(IndVarSimplifyPass());
@@ -311,8 +313,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
 
   if (EnableLoopInterchange)
     LPM2.addPass(LoopInterchangePass());
-  if (EnableLoopFlatten)
-    LPM2.addPass(LoopFlattenPass());
 
   // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
   // because it changes IR to makes profile annotation in back compile
@@ -475,6 +475,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   LPM1.addPass(
       SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 &&
                              EnableO3NonTrivialUnswitching));
+  if (EnableLoopFlatten)
+    LPM1.addPass(LoopFlattenPass());
+
   LPM2.addPass(LoopIdiomRecognizePass());
   LPM2.addPass(IndVarSimplifyPass());
 
@@ -485,8 +488,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
 
   if (EnableLoopInterchange)
     LPM2.addPass(LoopInterchangePass());
-  if (EnableLoopFlatten)
-    LPM2.addPass(LoopFlattenPass());
 
   // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
   // because it changes IR to makes profile annotation in back compile
@@ -1628,10 +1629,10 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   MainFPM.addPass(ConstraintEliminationPass());
 
   LoopPassManager LPM;
-  LPM.addPass(IndVarSimplifyPass());
-  LPM.addPass(LoopDeletionPass());
   if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
     LPM.addPass(LoopFlattenPass());
+  LPM.addPass(IndVarSimplifyPass());
+  LPM.addPass(LoopDeletionPass());
   // FIXME: Add loop interchange.
 
   // Unroll small loops and perform peeling.
64 changes: 64 additions & 0 deletions llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
@@ -0,0 +1,64 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='default<O3>' -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"

define dso_local void @_Z3fooPiii(i32* %A, i32 %N, i32 %M) #0 {
; CHECK-LABEL: @_Z3fooPiii(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[M:%.*]], 0
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP3]], i1 [[CMP21]], i1 false
; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.cond1.preheader.lr.ph.split.us:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[M]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: [[FLATTEN_TRIPCOUNT:%.*]] = mul nuw nsw i64 [[TMP0]], [[TMP1]]
; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]]
; CHECK: for.cond1.preheader.us:
; CHECK-NEXT: [[INDVAR6:%.*]] = phi i64 [ [[INDVAR_NEXT7:%.*]], [[FOR_COND1_PREHEADER_US]] ], [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US]] ]
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVAR6]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
; CHECK-NEXT: tail call void @_Z1fi(i32 [[TMP2]])
; CHECK-NEXT: [[INDVAR_NEXT7]] = add nuw nsw i64 [[INDVAR6]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVAR_NEXT7]], [[FLATTEN_TRIPCOUNT]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
;
entry:
  br label %for.cond

for.cond:
  %i.0 = phi i32 [ 0, %entry ], [ %inc6, %for.cond.cleanup3 ]
  %cmp = icmp slt i32 %i.0, %N
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void

for.body:
  br label %for.cond1

for.cond1:
  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ]
  %cmp2 = icmp slt i32 %j.0, %M
  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3

for.cond.cleanup3:
  %inc6 = add nsw i32 %i.0, 1
  br label %for.cond

for.body4:
  %mul = mul nsw i32 %i.0, %M
  %add = add nsw i32 %mul, %j.0
  %idxprom = sext i32 %add to i64
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
  %0 = load i32, i32* %arrayidx, align 4
  call void @_Z1fi(i32 %0)
  %inc = add nsw i32 %j.0, 1
  br label %for.cond1
}

declare dso_local void @_Z1fi(i32) #2
