Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -562,9 +562,13 @@ class AMDGPURewriteAGPRCopyMFMAPass
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;

void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass();

struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

namespace AMDGPU {
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef MODULE_PASS

#ifndef MODULE_PASS_WITH_PARAMS
Expand Down Expand Up @@ -69,6 +68,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef FUNCTION_PASS

#ifndef FUNCTION_ANALYSIS
Expand Down
7 changes: 4 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
Expand Down Expand Up @@ -884,9 +885,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());

if (EnableUniformIntrinsicCombine)
PM.addPass(AMDGPUUniformIntrinsicCombinePass());
});

PB.registerPeepholeEPCallback(
Expand All @@ -897,6 +895,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify)
FPM.addPass(AMDGPUSimplifyLibCallsPass());

if (EnableUniformIntrinsicCombine)
FPM.addPass(AMDGPUUniformIntrinsicCombinePass());
});

PB.registerCGSCCOptimizerLateEPCallback(
Expand Down
78 changes: 55 additions & 23 deletions llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,6 @@
/// uniformity. And every instruction that's downstream and cares about dynamic
/// uniformity must be convergent (and isel will introduce v_readfirstlane for
/// them if their operands can't be proven statically uniform).
///
/// This pass is implemented as a ModulePass because intrinsic declarations
/// exist at the module scope, allowing us to skip processing entirely if no
/// declarations are present and to traverse their user lists directly when
/// they are. A FunctionPass would instead require scanning every instruction
/// in every function to find relevant intrinsics, which is far less efficient.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
Expand Down Expand Up @@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
Tracker[NotOp] = true; // NOT preserves uniformity
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
ICmp->replaceAllUsesWith(NotOp);
ICmp->eraseFromParent();
Changed = true;
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
// Case: (icmp ne %ballot, 0) -> %ballot_arg
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
<< *Src << '\n');
ICmp->replaceAllUsesWith(Src);
ICmp->eraseFromParent();
Changed = true;
}
}
Expand All @@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
return false;
}

/// Iterates over intrinsic declarations in the module to optimize their uses.
static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
/// Iterates over intrinsic calls in the Function to optimize.
static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
bool IsChanged = false;
ValueMap<const Value *, bool> Tracker;

FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
for (Function &F : M) {
switch (F.getIntrinsicID()) {
for (Instruction &I : make_early_inc_range(instructions(F))) {
auto *II = dyn_cast<IntrinsicInst>(&I);
if (!II)
continue;

switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
Expand All @@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
default:
continue;
}

for (User *U : make_early_inc_range(F.users())) {
auto *II = cast<IntrinsicInst>(U);
Function *ParentF = II->getFunction();
const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
}
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
}
return IsChanged;
}

PreservedAnalyses
AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
if (!runUniformIntrinsicCombine(M, AM))
AMDGPUUniformIntrinsicCombinePass::run(Function &F,
FunctionAnalysisManager &AM) {
const auto &UI = AM.getResult<UniformityInfoAnalysis>(F);
if (!runUniformIntrinsicCombine(F, UI))
return PreservedAnalyses::all();

PreservedAnalyses PA;
PA.preserve<UniformityInfoAnalysis>();
return PA;
}

namespace {
class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
public:
static char ID;
AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {
initializeAMDGPUUniformIntrinsicCombineLegacyPass(
*PassRegistry::getPassRegistry());
}

private:
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<UniformityInfoWrapperPass>();
AU.addRequired<TargetPassConfig>();
}
};
} // namespace

char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
AMDGPUUniformIntrinsicCombineLegacy::ID;

bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
const UniformityInfo &UI =
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
return runUniformIntrinsicCombine(F, UI);
}

INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU Uniform Intrinsic Combine", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU Uniform Intrinsic Combine", false, false)

FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
return new AMDGPUUniformIntrinsicCombineLegacy();
}
16 changes: 16 additions & 0 deletions llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1)
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
Expand Down Expand Up @@ -75,7 +77,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrs
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]]
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
Expand Down Expand Up @@ -126,6 +130,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1)
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 0, [[BALLOT]]
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
Expand Down Expand Up @@ -175,6 +181,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspac
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
Expand Down Expand Up @@ -225,7 +233,9 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1)
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0
Expand Down Expand Up @@ -292,7 +302,9 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]]
Expand Down Expand Up @@ -359,7 +371,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
Expand Down Expand Up @@ -410,6 +424,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[DONE]])
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i32 0, [[BALLOT]]
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
Expand Down
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,8 @@ define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0
; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
; PASS-CHECK-NEXT: ret void
;
Expand Down Expand Up @@ -623,6 +625,8 @@ define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[C]])
; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
; PASS-CHECK-NEXT: ret void
;
Expand Down