[AMDGPUUnifyDivergentExitNodes] Add NewPM support
Along the way, use UniformityAnalysis instead of LegacyDivergenceAnalysis to collect divergence info.

Reviewed By: arsenm, sameerds

Differential Revision: https://reviews.llvm.org/D141355
gandhi56 committed Mar 25, 2023
1 parent 731264b commit b48e7c2
Showing 4 changed files with 128 additions and 51 deletions.
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,6 +22,7 @@
#include "AMDGPURegBankSelect.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
@@ -655,6 +656,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        if (PassName == "amdgpu-unify-divergent-exit-nodes") {
          PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
          return true;
        }
        return false;
      });
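
With this callback in place, "amdgpu-unify-divergent-exit-nodes" becomes a valid element of a new-PM pipeline string whenever the AMDGPU target's callbacks have been registered. The following is a minimal sketch of driving the pass that way; the helper name and setup are illustrative only, not part of the commit, and it assumes TM points to an already-constructed AMDGPU TargetMachine, since the PassBuilder constructor is what calls TM->registerPassBuilderCallbacks().

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"
#include "llvm/Target/TargetMachine.h"

// Hypothetical helper: run the newly exposed pass by its textual name.
void runUnifyDivergentExits(llvm::Module &M, llvm::TargetMachine *TM) {
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;

  // Passing TM here triggers registerPassBuilderCallbacks, which installs
  // the pipeline-parsing callback shown above.
  llvm::PassBuilder PB(TM);
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::ModulePassManager MPM;
  if (llvm::Error Err = PB.parsePassPipeline(
          MPM, "function(amdgpu-unify-divergent-exit-nodes)"))
    llvm::report_fatal_error(std::move(Err));
  MPM.run(M, MAM);
}

The new test RUN line below exercises the same path through opt's -p flag.
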

71 changes: 50 additions & 21 deletions llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -19,6 +19,7 @@
//
//===----------------------------------------------------------------------===//

#include "AMDGPUUnifyDivergentExitNodes.h"
#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/ArrayRef.h"
@@ -53,40 +54,48 @@ using namespace llvm;

namespace {

class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
class AMDGPUUnifyDivergentExitNodesImpl {
private:
  const TargetTransformInfo *TTI = nullptr;

public:
  static char ID; // Pass identification, replacement for typeid

  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
  }
  AMDGPUUnifyDivergentExitNodesImpl() = delete;
  AMDGPUUnifyDivergentExitNodesImpl(const TargetTransformInfo *TTI)
      : TTI(TTI) {}

  // We can preserve non-critical-edgeness when we unify function exit nodes
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                  ArrayRef<BasicBlock *> ReturningBlocks,
                                  StringRef Name);
  bool runOnFunction(Function &F) override;
  bool run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT,
           const UniformityInfo &UA);
};

class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
public:
  static char ID;
  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
    initializeAMDGPUUnifyDivergentExitNodesPass(
        *PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnFunction(Function &F) override;
};
} // end anonymous namespace

char AMDGPUUnifyDivergentExitNodes::ID = 0;

char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;

INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                     "Unify divergent function exit nodes", false, false)
                      "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                    "Unify divergent function exit nodes", false, false)

void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
  if (RequireAndPreserveDomTree)
    AU.addRequired<DominatorTreeWrapperPass>();

@@ -132,7 +141,7 @@ static bool isUniformlyReached(const UniformityInfo &UA, BasicBlock &BB) {
  return true;
}

BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
    Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
    StringRef Name) {
  // Otherwise, we need to insert a new basic block into the function, add a PHI
@@ -180,21 +189,14 @@ BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
  return NewRetBlock;
}

bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  DominatorTree *DT = nullptr;
  if (RequireAndPreserveDomTree)
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
                                            const PostDominatorTree &PDT,
                                            const UniformityInfo &UA) {
  if (PDT.root_size() == 0 ||
      (PDT.root_size() == 1 &&
       !isa<BranchInst>(PDT.getRoot()->getTerminator())))
    return false;

  UniformityInfo &UA =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  // Loop over all of the blocks in a function, tracking all of the blocks that
  // return.
  SmallVector<BasicBlock *, 4> ReturningBlocks;
@@ -327,3 +329,30 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
  return true;
}

bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  DominatorTree *DT = nullptr;
  if (RequireAndPreserveDomTree)
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  const auto &PDT =
      getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
  const auto &UA = getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const auto *TransformInfo =
      &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, DT, PDT, UA);
}

PreservedAnalyses
AMDGPUUnifyDivergentExitNodesPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  DominatorTree *DT = nullptr;
  if (RequireAndPreserveDomTree)
    DT = &AM.getResult<DominatorTreeAnalysis>(F);

  const auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
  const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
  const auto *TransformInfo = &AM.getResult<TargetIRAnalysis>(F);
  return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, DT, PDT, UA)
             ? PreservedAnalyses::none()
             : PreservedAnalyses::all();
}
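
Besides the textual pipeline route, the new-PM pass can be added to a FunctionPassManager directly. Below is a hedged sketch, not part of the commit: the helper name is invented, and a real AMDGPU TargetMachine would be needed for UniformityInfoAnalysis to report any divergence. Note that the run method above conservatively returns PreservedAnalyses::none() whenever the IR changed, even though the DomTreeUpdater keeps the dominator tree valid.

#include "AMDGPUUnifyDivergentExitNodes.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"

// Hypothetical helper: run just this pass on one function. Everything the
// pass requests (DominatorTreeAnalysis, PostDominatorTreeAnalysis,
// UniformityInfoAnalysis, TargetIRAnalysis) is a function analysis that
// registerFunctionAnalyses installs.
void runUnifyOnFunction(llvm::Function &F) {
  llvm::FunctionAnalysisManager FAM;
  llvm::PassBuilder PB;
  PB.registerFunctionAnalyses(FAM);

  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::AMDGPUUnifyDivergentExitNodesPass());
  FPM.run(F, FAM);
}
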
31 changes: 31 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h
@@ -0,0 +1,31 @@
//===- AMDGPUUnifyDivergentExitNodes.h ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there
// is at most one divergent exiting block.
//
// StructurizeCFG can't deal with multi-exit regions formed by branches to
// multiple return nodes. It is not desirable to structurize regions with
// uniform branches, and unifying those into the same return block as the
// divergent branches would inhibit the use of scalar branching. StructurizeCFG
// also can't deal with the case where one branch goes to a return and another
// to an unreachable; in that case, this pass replaces the unreachable with a
// return.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"

namespace llvm {
class AMDGPUUnifyDivergentExitNodesPass
    : public PassInfoMixin<AMDGPUUnifyDivergentExitNodesPass> {
public:
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

} // end namespace llvm
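
The situation the header comment describes can be made concrete with a small hand-built function: a conditional branch (divergent whenever its condition is) whose one successor returns while the other ends in unreachable. The IRBuilder sketch below is illustrative only and not part of the commit; after the pass runs on such input, the unreachable exit is funneled into a single return block, like the DummyReturnBlock in the test that follows.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

// Hypothetical input: one exit returns, the other is unreachable.
llvm::Function *buildDivergentExitExample(llvm::Module &M) {
  llvm::LLVMContext &Ctx = M.getContext();
  auto *FnTy = llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx),
                                       {llvm::Type::getInt1Ty(Ctx)}, false);
  auto *F = llvm::Function::Create(FnTy, llvm::Function::ExternalLinkage,
                                   "divergent_exits", M);
  auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", F);
  auto *RetBB = llvm::BasicBlock::Create(Ctx, "ret", F);
  auto *TrapBB = llvm::BasicBlock::Create(Ctx, "trap", F);

  llvm::IRBuilder<> B(Entry);
  // Branch on the i1 argument; if the argument is divergent, so is the branch.
  B.CreateCondBr(F->getArg(0), RetBB, TrapBB);
  B.SetInsertPoint(RetBB);
  B.CreateRetVoid();
  B.SetInsertPoint(TrapBB);
  B.CreateUnreachable();
  return F;
}
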
72 changes: 42 additions & 30 deletions llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -1,36 +1,48 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT
; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA

define void @nested_inf_loop(i1 %0, i1 %1) {
; CHECK-LABEL: nested_inf_loop:
; CHECK-NEXT: %bb.0: ; %BB
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; CHECK-NEXT: s_xor_b64 s[6:7], vcc, -1
; CHECK-NEXT: s_mov_b64 s[8:9], 0
; CHECK-NEXT: .LBB0_1: ; %BB1
; CHECK: s_and_b64 s[10:11], exec, s[6:7]
; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: %bb.2: ; %BB2
; CHECK: s_or_b64 exec, exec, s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], 0
; CHECK-NEXT: .LBB0_3: ; %BB4
; CHECK: s_and_b64 s[10:11], exec, s[4:5]
; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
; CHECK-NEXT: s_cbranch_execnz .LBB0_3
; CHECK-NEXT: %bb.4: ; %loop.exit.guard
; CHECK: s_or_b64 exec, exec, s[8:9]
; CHECK-NEXT: s_mov_b64 vcc, 0
; CHECK-NEXT: s_mov_b64 s[8:9], 0
; CHECK-NEXT: s_branch .LBB0_1
; CHECK-NEXT: %bb.5: ; %DummyReturnBlock
; CHECK-NEXT: s_setpc_b64 s[30:31]
; OPT-LABEL: @nested_inf_loop(
; OPT-NEXT: BB:
; OPT-NEXT: br label [[BB1:%.*]]
; OPT: BB1:
; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]]
; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]]
; OPT: infloop:
; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]]
; OPT: DummyReturnBlock:
; OPT-NEXT: ret void
;
; ISA-LABEL: nested_inf_loop:
; ISA-NEXT: %bb.0: ; %BB
; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; ISA-NEXT: v_and_b32_e32 v1, 1, v1
; ISA-NEXT: v_and_b32_e32 v0, 1, v0
; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
; ISA-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; ISA-NEXT: s_xor_b64 s[6:7], vcc, -1
; ISA-NEXT: s_mov_b64 s[8:9], 0
; ISA-NEXT: .LBB0_1: ; %BB1
; ISA: s_and_b64 s[10:11], exec, s[6:7]
; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
; ISA-NEXT: s_cbranch_execnz .LBB0_1
; ISA-NEXT: %bb.2: ; %BB2
; ISA: s_or_b64 exec, exec, s[8:9]
; ISA-NEXT: s_mov_b64 s[8:9], 0
; ISA-NEXT: .LBB0_3: ; %BB4
; ISA: s_and_b64 s[10:11], exec, s[4:5]
; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
; ISA-NEXT: s_cbranch_execnz .LBB0_3
; ISA-NEXT: %bb.4: ; %loop.exit.guard
; ISA: s_or_b64 exec, exec, s[8:9]
; ISA-NEXT: s_mov_b64 vcc, 0
; ISA-NEXT: s_mov_b64 s[8:9], 0
; ISA-NEXT: s_branch .LBB0_1
; ISA-NEXT: %bb.5: ; %DummyReturnBlock
; ISA-NEXT: s_setpc_b64 s[30:31]
BB:
  br label %BB1
