[amdgpu] Add the late codegen preparation pass.
Summary:
- Teach the new pass to widen sub-DWORD loads that are naturally aligned but
  not DWORD aligned (an IR sketch of the rewrite is shown below).

Reviewers: rampitec, arsenm

Subscribers:

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80364
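
For illustration only (not part of the committed change): a minimal LLVM IR
sketch of the rewrite this pass performs, assuming a uniform i16 load from a
DWORD-aligned constant-address-space base at byte offset 6; all value names
below are hypothetical.

  ; before: 2-byte aligned i16 load at byte offset 6 from %base
  %p   = getelementptr i8, i8 addrspace(4)* %base, i64 6
  %p16 = bitcast i8 addrspace(4)* %p to i16 addrspace(4)*
  %v   = load i16, i16 addrspace(4)* %p16, align 2

  ; after: widened to a DWORD load at byte offset 4, then shifted and truncated
  %q   = getelementptr i8, i8 addrspace(4)* %base, i64 4
  %q32 = bitcast i8 addrspace(4)* %q to i32 addrspace(4)*
  %w   = load i32, i32 addrspace(4)* %q32, align 4
  %s   = lshr i32 %w, 16
  %v2  = trunc i32 %s to i16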
darkbuck committed Oct 27, 2020
1 parent 64d3ed3 commit 46c3d5c
Showing 6 changed files with 252 additions and 0 deletions.
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -68,6 +68,7 @@ FunctionPass *createSIPostRABundlerPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
@@ -223,6 +224,9 @@ extern char &AMDGPUAnnotateUniformValuesPassID;
void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
extern char &AMDGPUCodeGenPrepareID;

void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
extern char &AMDGPULateCodeGenPrepareID;

void initializeSIAnnotateControlFlowPass(PassRegistry&);
extern char &SIAnnotateControlFlowPassID;

198 changes: 198 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -0,0 +1,198 @@
//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the LoadStoreVectorizer, as that
// pass does not handle overlapping cases. In addition, this pass extends the
// widening to handle scalar sub-DWORD loads that are naturally aligned but
// not DWORD aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It must be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip loads that are already at least DWORD aligned, as those are handled
  // in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it is not safe to perform the following
  // transformation.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to
    // DWORD alignment.
    LI.setAlignment(Align(4));
    return true;
  }
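
  // Otherwise, rewrite the load. For example, an i16 load at Offset = 6 from a
  // DWORD-aligned base has Adjust = 2: the DWORD at byte offset 4 is loaded
  // with 4-byte alignment, and the original value is recovered with a 16-bit
  // logical shift right followed by a trunc back to i16.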

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
                             Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -236,6 +236,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -865,6 +866,7 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  addPass(createAMDGPULateCodeGenPreparePass());
  if (EnableAtomicOptimizations) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -58,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
  AMDGPUISelDAGToDAG.cpp
  AMDGPUISelLowering.cpp
  AMDGPUGlobalISelUtils.cpp
  AMDGPULateCodeGenPrepare.cpp
  AMDGPULegalizerInfo.cpp
  AMDGPULibCalls.cpp
  AMDGPULibFunc.cpp
16 changes: 16 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -14,6 +14,22 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
  ret void
}

; GCN-LABEL: {{^}}test2
; GCN: enable_sgpr_dispatch_ptr = 1
; GCN: s_load_dword s[[REG:[0-9]+]], s[4:5], 0x1
; GCN: s_lshr_b32 s{{[0-9]+}}, s[[REG]], 16
; GCN-NOT: load_ushort
; GCN: s_endpgm
define amdgpu_kernel void @test2(i32 addrspace(1)* %out) {
  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
  %d1 = getelementptr inbounds i8, i8 addrspace(4)* %dispatch_ptr, i64 6
  %h1 = bitcast i8 addrspace(4)* %d1 to i16 addrspace(4)*
  %v1 = load i16, i16 addrspace(4)* %h1
  %e1 = zext i16 %v1 to i32
  store i32 %e1, i32 addrspace(1)* %out
  ret void
}

declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0

attributes #0 = { readnone }
31 changes: 31 additions & 0 deletions llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
@@ -22,6 +22,37 @@ entry:
  ret void
}

; A little more complicated case where more sub-dword loads could be coalesced
; if they are not widened earlier.
; GCN-LABEL: {{^}}load_4i16:
; GCN: s_load_dwordx2 s{{\[}}[[D0:[0-9]+]]:[[D1:[0-9]+]]{{\]}}, s[4:5], 0x4
; GCN-NOT: s_load_dword {{s[0-9]+}}, s[4:5], 0x4
; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D0]], 16
; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D1]], 16
; GCN: s_endpgm
define protected amdgpu_kernel void @load_4i16(i32 addrspace(1)* %out) {
entry:
  %disp = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep_x = getelementptr i8, i8 addrspace(4)* %disp, i64 4
  %gep_x.cast = bitcast i8 addrspace(4)* %gep_x to i16 addrspace(4)*
  %id_x = load i16, i16 addrspace(4)* %gep_x.cast, align 4, !invariant.load !0 ; load workgroup size x
  %gep_y = getelementptr i8, i8 addrspace(4)* %disp, i64 6
  %gep_y.cast = bitcast i8 addrspace(4)* %gep_y to i16 addrspace(4)*
  %id_y = load i16, i16 addrspace(4)* %gep_y.cast, align 2, !invariant.load !0 ; load workgroup size y
  %gep_z = getelementptr i8, i8 addrspace(4)* %disp, i64 8
  %gep_z.cast = bitcast i8 addrspace(4)* %gep_z to i16 addrspace(4)*
  %id_z = load i16, i16 addrspace(4)* %gep_z.cast, align 4, !invariant.load !0 ; load workgroup size z
  %gep_w = getelementptr i8, i8 addrspace(4)* %disp, i64 10
  %gep_w.cast = bitcast i8 addrspace(4)* %gep_w to i16 addrspace(4)*
  %id_w = load i16, i16 addrspace(4)* %gep_w.cast, align 2, !invariant.load !0 ; load the i16 field at offset 10
  %add = add nuw nsw i16 %id_y, %id_x
  %add2 = add nuw nsw i16 %id_z, %id_w
  %add3 = add nuw nsw i16 %add, %add2
  %conv = zext i16 %add3 to i32
  store i32 %conv, i32 addrspace(1)* %out, align 4
  ret void
}

declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()

!0 = !{!0}
