[AMDGPU] Break-up large PHIs for DAGISel
DAGISel uses CopyToReg/CopyFromReg to lower PHI nodes. With large PHIs, this can result in poor codegen,
because it introduces the need for a build_vector before copying the PHI value, and that build_vector may have many undef elements. This can cause very high register pressure and abnormal stack usage in some cases.

This scalarization/PHI break-up can easily be tuned or disabled through the cl::opt flags added here (amdgpu-codegenprepare-break-large-phis and amdgpu-codegenprepare-break-large-phis-threshold) in case it's not beneficial for some users.
It's also only enabled for DAGISel, as GlobalISel handles PHIs much better (it works on the whole function).

This can both scalarize (break a vector into its elements) and simplify (break a vector into smaller, more manageable subvectors) PHIs.
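
For illustration, here is a minimal standalone sketch of the slicing arithmetic (assuming the default 32-bit threshold; the Slice struct and computeSlices helper are illustrative names, not part of the patch — the real logic lives in visitPHINode below):

#include <cstdio>
#include <vector>

// Illustrative sketch of the slice layout, not the pass itself.
struct Slice {
  unsigned Idx;     // Index of the first element covered by this slice.
  unsigned NumElts; // 1 = scalar slice, >1 = 32-bit subvector slice.
};

// Compute how a fixed vector of NumElts elements, EltSize bits each, is
// broken up: as many 32-bit subvectors as possible for 8/16-bit elements,
// then a fully scalarized tail.
static std::vector<Slice> computeSlices(unsigned EltSize, unsigned NumElts) {
  std::vector<Slice> Slices;
  unsigned Idx = 0;
  if (EltSize == 8 || EltSize == 16) {
    const unsigned SubVecSize = 32 / EltSize;
    // Equivalent of alignDown(NumElts, SubVecSize) in the real code.
    for (unsigned End = NumElts - (NumElts % SubVecSize); Idx < End;
         Idx += SubVecSize)
      Slices.push_back({Idx, SubVecSize});
  }
  // Scalarize all remaining elements (everything, for 32-bit+ elements).
  for (; Idx < NumElts; ++Idx)
    Slices.push_back({Idx, 1});
  return Slices;
}

int main() {
  // <9 x i16>: four <2 x i16> subvector slices plus one scalar i16 tail.
  for (const Slice &S : computeSlices(16, 9))
    std::printf("slice at index %u, %u element(s)\n", S.Idx, S.NumElts);
  return 0;
}

A <9 x i16> PHI thus becomes four <2 x i16> PHIs plus one i16 PHI, while e.g. a <4 x i32> PHI is fully scalarized into four i32 PHIs.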

Fixes SWDEV-321581

Reviewed By: kzhuravl

Differential Revision: https://reviews.llvm.org/D143731
Pierre-vh committed Mar 28, 2023
1 parent 7765e5d commit d892521
Showing 16 changed files with 2,016 additions and 1,003 deletions.
111 changes: 111 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -47,6 +47,16 @@ static cl::opt<bool> Widen16BitOps(
    cl::ReallyHidden,
    cl::init(true));

static cl::opt<bool>
    ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis",
                       cl::desc("Break large PHI nodes for DAGISel"),
                       cl::ReallyHidden, cl::init(true));

static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

static cl::opt<bool> UseMul24Intrin(
    "amdgpu-codegenprepare-mul24",
    cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
@@ -213,6 +223,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
@@ -1383,6 +1394,106 @@ bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  return Changed;
}

bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
  // Break-up fixed-vector PHIs into smaller pieces.
  // Default threshold is 32, so it breaks up any vector that's >32 bits into
  // its elements, or into 32-bit pieces (for 8/16 bit elts).
  //
  // This is only helpful for DAGISel because it doesn't handle large PHIs as
  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
  // With large, odd-sized PHIs we may end up needing many `build_vector`
  // operations with most elements being "undef". This inhibits a lot of
  // optimization opportunities and can result in unreasonably high register
  // pressure and the inevitable stack spilling.
  if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
  if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
    return false;

  struct VectorSlice {
    Type *Ty = nullptr;
    unsigned Idx = 0;
    unsigned NumElts = 0;
    std::vector<Value *> IncomingValues = {};
    PHINode *NewPHI = nullptr;
  };

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    // For 8/16 bits type, don't scalarize fully but break it up into as many
    // 32-bit slices as we can, and scalarize the tail.
    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
    const unsigned NumElts = FVT->getNumElements();
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.push_back(VectorSlice{SubVecTy, Idx, SubVecSize});
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.push_back(VectorSlice{EltTy, Idx, 1});
  }
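  // Example: with the default 32-bit threshold, a <5 x i8> PHI yields one
  // <4 x i8> subvector slice (index 0) plus one scalar i8 tail (index 4).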

  if (Slices.size() == 1)
    return false;

  // Break up this PHI's incoming values.
  for (unsigned Idx = 0; Idx < I.getNumIncomingValues(); ++Idx) {
    Value *Inc = I.getIncomingValue(Idx);

    IRBuilder<> B(I.getIncomingBlock(Idx)->getTerminator());
    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
      B.SetCurrentDebugLocation(IncInst->getDebugLoc());

    unsigned NameSuffix = 0;
    for (VectorSlice &S : Slices) {
      const auto ValName =
          "largephi.extractslice" + std::to_string(NameSuffix++);
      if (S.NumElts > 1) {
        SmallVector<int, 4> Mask;
        for (unsigned K = S.Idx; K < (S.Idx + S.NumElts); ++K)
          Mask.push_back(K);
        S.IncomingValues.push_back(B.CreateShuffleVector(Inc, Mask, ValName));
      } else
        S.IncomingValues.push_back(B.CreateExtractElement(Inc, S.Idx, ValName));
    }
  }

  // Now create one PHI per vector piece.
  IRBuilder<> B(I.getParent()->getFirstNonPHI());
  B.SetCurrentDebugLocation(I.getDebugLoc());

  for (VectorSlice &S : Slices) {
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
    for (const auto &[Idx, BB] : enumerate(I.blocks()))
      S.NewPHI->addIncoming(S.IncomingValues[Idx], BB);
  }

  // And replace this PHI with a vector of all the previous PHI values.
  Value *Vec = PoisonValue::get(FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
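    // llvm.vector.insert requires the index to be a multiple of the subvector
    // length; S.Idx always is, since subvector slices are laid out
    // front-to-back.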
    if (S.NumElts > 1)
      Vec =
          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
    else
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
  return true;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
