Skip to content

Commit

Permalink
Only unswitch loops with uniform conditions
Browse files Browse the repository at this point in the history
Loop unswitching can be extremely harmful for a SIMT target. If the
hoisted condition is not uniform, a SIMT machine will execute both
clones of the loop sequentially. Therefore LoopUnswitch checks that the
condition is non-divergent.

Since DivergenceAnalysis adds an expensive PostDominatorTree analysis
that is not needed for non-SIMT targets, a new option is added to avoid
unneeded analysis initialization. The method getAnalysisUsage is called when
TargetTransformInfo is not yet available, so we cannot use it here.
For that reason a new field, DivergentTarget, is added to PassManagerBuilder
to control the behavior; a target sets this field.

Differential Revision: https://reviews.llvm.org/D30796

llvm-svn: 298104
  • Loading branch information
rampitec committed Mar 17, 2017
1 parent d06b025 commit ee2dd78
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 7 deletions.
1 change: 1 addition & 0 deletions llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
Expand Up @@ -153,6 +153,7 @@ class PassManagerBuilder {
bool PrepareForLTO;
bool PrepareForThinLTO;
bool PerformThinLTO;
bool DivergentTarget;

/// Enable profile instrumentation pass.
bool EnablePGOInstrGen;
Expand Down
3 changes: 2 additions & 1 deletion llvm/include/llvm/Transforms/Scalar.h
Expand Up @@ -169,7 +169,8 @@ Pass *createLoopStrengthReducePass();
//
// LoopUnswitch - This pass is a simple loop unswitching pass.
//
Pass *createLoopUnswitchPass(bool OptimizeForSize = false);
Pass *createLoopUnswitchPass(bool OptimizeForSize = false,
bool hasBranchDivergence = false);

//===----------------------------------------------------------------------===//
//
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Expand Up @@ -216,6 +216,8 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
Builder.DivergentTarget = true;

bool Internalize = InternalizeSymbols &&
(getOptLevel() > CodeGenOpt::None) &&
(getTargetTriple().getArch() == Triple::amdgcn);
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
Expand Up @@ -168,6 +168,7 @@ PassManagerBuilder::PassManagerBuilder() {
PGOInstrUse = RunPGOInstrUse;
PrepareForThinLTO = EnablePrepareForThinLTO;
PerformThinLTO = false;
DivergentTarget = false;
}

PassManagerBuilder::~PassManagerBuilder() {
Expand Down Expand Up @@ -307,7 +308,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
// Rotate Loop - disable header duplication at -Oz
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
MPM.add(createLICMPass()); // Hoist loop invariants
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
MPM.add(createCFGSimplificationPass());
addInstructionCombiningPass(MPM);
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
Expand Down Expand Up @@ -588,7 +589,7 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createCorrelatedValuePropagationPass());
addInstructionCombiningPass(MPM);
MPM.add(createLICMPass());
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
MPM.add(createCFGSimplificationPass());
addInstructionCombiningPass(MPM);
}
Expand Down
23 changes: 19 additions & 4 deletions llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
Expand Up @@ -33,6 +33,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
Expand Down Expand Up @@ -180,12 +181,14 @@ namespace {
// NewBlocks contained cloned copy of basic blocks from LoopBlocks.
std::vector<BasicBlock*> NewBlocks;

bool hasBranchDivergence;

public:
static char ID; // Pass ID, replacement for typeid
explicit LoopUnswitch(bool Os = false) :
explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) :
LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
currentLoop(nullptr), DT(nullptr), loopHeader(nullptr),
loopPreheader(nullptr) {
loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) {
initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
}

Expand All @@ -198,6 +201,8 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
if (hasBranchDivergence)
AU.addRequired<DivergenceAnalysis>();
getLoopAnalysisUsage(AU);
}

Expand Down Expand Up @@ -367,11 +372,12 @@ INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)

Pass *llvm::createLoopUnswitchPass(bool Os) {
return new LoopUnswitch(Os);
Pass *llvm::createLoopUnswitchPass(bool Os, bool hasBranchDivergence) {
return new LoopUnswitch(Os, hasBranchDivergence);
}

/// Operator chain lattice.
Expand Down Expand Up @@ -808,6 +814,15 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
<< ". Cost too high.\n");
return false;
}
if (hasBranchDivergence &&
getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
DEBUG(dbgs() << "NOT unswitching loop %"
<< currentLoop->getHeader()->getName()
<< " at non-trivial condition '" << *Val
<< "' == " << *LoopCond << "\n"
<< ". Condition is divergent.\n");
return false;
}

UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
return true;
Expand Down
85 changes: 85 additions & 0 deletions llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
@@ -0,0 +1,85 @@
; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s

; Check that loop unswitch happened and condition hoisted out of the loop.
; Condition is uniform so all targets should perform unswitching.

; CHECK-LABEL: {{^}}define void @uniform_unswitch
; CHECK: entry:
; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
; CHECK-NEXT: br i1

define void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph: ; preds = %entry
%cmp1 = icmp eq i32 %x, 123456
br label %for.body

for.cond.cleanup.loopexit: ; preds = %for.inc
br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void

for.body: ; preds = %for.inc, %for.body.lr.ph
%i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
br i1 %cmp1, label %if.then, label %for.inc

if.then: ; preds = %for.body
%arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07
store i32 %i.07, i32 * %arrayidx, align 4
br label %for.inc

for.inc: ; preds = %for.body, %if.then
%inc = add nuw nsw i32 %i.07, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

; Check that loop unswitch does not happen if condition is divergent.

; CHECK-LABEL: {{^}}define void @divergent_unswitch
; CHECK: entry:
; CHECK: icmp
; CHECK: [[IF_COND:%[a-z0-9]+]] = icmp {{.*}} 567890
; CHECK: br label
; CHECK: br i1 [[IF_COND]]

define void @divergent_unswitch(i32 * nocapture %out, i32 %n) {
entry:
%cmp9 = icmp sgt i32 %n, 0
br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph: ; preds = %entry
%call = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%cmp2 = icmp eq i32 %call, 567890
br label %for.body

for.cond.cleanup.loopexit: ; preds = %for.inc
br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void

for.body: ; preds = %for.inc, %for.body.lr.ph
%i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
br i1 %cmp2, label %if.then, label %for.inc

if.then: ; preds = %for.body
%arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.010
store i32 %i.010, i32 * %arrayidx, align 4
br label %for.inc

for.inc: ; preds = %for.body, %if.then
%inc = add nuw nsw i32 %i.010, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
2 changes: 2 additions & 0 deletions llvm/test/Transforms/LoopUnswitch/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
if not 'AMDGPU' in config.root.targets:
config.unsupported = True

0 comments on commit ee2dd78

Please sign in to comment.