[AMDGPU] Break-up large PHIs for DAGISel
DAGISel uses CopyToReg/CopyFromReg to lower PHI nodes. With large PHIs, this can result in poor codegen,
because it introduces the need for a build_vector before copying the PHI value, and that build_vector may have many undef elements. This can cause very high register pressure and abnormal stack usage in some cases.

This scalarization/PHI break-up can easily be tuned or disabled through the cl::opt flags added here (amdgpu-codegenprepare-break-large-phis and amdgpu-codegenprepare-break-large-phis-threshold) in case it's not beneficial for some users.
It's also only enabled for DAGISel, as GlobalISel handles PHIs much better (it works on the whole function).

This can both scalarize (break a vector into its elements) and simplify (break a vector into smaller, more manageable subvectors) PHIs.
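
For illustration, here is a minimal standalone sketch of the slicing arithmetic (assuming the default 32-bit threshold; the Slice struct and computeSlices helper are illustrative names, not part of the patch — the real logic lives in visitPHINode below):

#include <cstdio>
#include <vector>

// Illustrative sketch of the slice layout, not the pass itself.
struct Slice {
  unsigned Idx;     // Index of the first element covered by this slice.
  unsigned NumElts; // 1 = scalar slice, >1 = 32-bit subvector slice.
};

// Compute how a fixed vector of NumElts elements, EltSize bits each, is
// broken up: as many 32-bit subvectors as possible for 8/16-bit elements,
// then a fully scalarized tail.
static std::vector<Slice> computeSlices(unsigned EltSize, unsigned NumElts) {
  std::vector<Slice> Slices;
  unsigned Idx = 0;
  if (EltSize == 8 || EltSize == 16) {
    const unsigned SubVecSize = 32 / EltSize;
    // Equivalent of alignDown(NumElts, SubVecSize) in the real code.
    for (unsigned End = NumElts - (NumElts % SubVecSize); Idx < End;
         Idx += SubVecSize)
      Slices.push_back({Idx, SubVecSize});
  }
  // Scalarize all remaining elements (everything, for 32-bit+ elements).
  for (; Idx < NumElts; ++Idx)
    Slices.push_back({Idx, 1});
  return Slices;
}

int main() {
  // <9 x i16>: four <2 x i16> subvector slices plus one scalar i16 tail.
  for (const Slice &S : computeSlices(16, 9))
    std::printf("slice at index %u, %u element(s)\n", S.Idx, S.NumElts);
  return 0;
}

A <9 x i16> PHI thus becomes four <2 x i16> PHIs plus one i16 PHI, while e.g. a <4 x i32> PHI is fully scalarized into four i32 PHIs.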

Fixes SWDEV-321581

Reviewed By: kzhuravl

Differential Revision: https://reviews.llvm.org/D143731
Pierre-vh committed Mar 28, 2023
1 parent 7765e5d commit d892521
Showing 16 changed files with 2,016 additions and 1,003 deletions.
111 changes: 111 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -47,6 +47,16 @@ static cl::opt<bool> Widen16BitOps(
    cl::ReallyHidden,
    cl::init(true));

static cl::opt<bool>
    ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis",
                       cl::desc("Break large PHI nodes for DAGISel"),
                       cl::ReallyHidden, cl::init(true));

static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

static cl::opt<bool> UseMul24Intrin(
    "amdgpu-codegenprepare-mul24",
    cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
@@ -213,6 +223,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
@@ -1383,6 +1394,106 @@ bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  return Changed;
}

bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
  // Break-up fixed-vector PHIs into smaller pieces.
  // Default threshold is 32, so it breaks up any vector that's >32 bits into
  // its elements, or into 32-bit pieces (for 8/16 bit elts).
  //
  // This is only helpful for DAGISel because it doesn't handle large PHIs as
  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
  // With large, odd-sized PHIs we may end up needing many `build_vector`
  // operations with most elements being "undef". This inhibits a lot of
  // optimization opportunities and can result in unreasonably high register
  // pressure and the inevitable stack spilling.
  if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
  if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
    return false;

  struct VectorSlice {
    Type *Ty = nullptr;
    unsigned Idx = 0;
    unsigned NumElts = 0;
    std::vector<Value *> IncomingValues = {};
    PHINode *NewPHI = nullptr;
  };

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    // For 8/16 bits type, don't scalarize fully but break it up into as many
    // 32-bit slices as we can, and scalarize the tail.
    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
    const unsigned NumElts = FVT->getNumElements();
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.push_back(VectorSlice{SubVecTy, Idx, SubVecSize});
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.push_back(VectorSlice{EltTy, Idx, 1});
  }
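  // Example: with the default 32-bit threshold, a <5 x i8> PHI yields one
  // <4 x i8> subvector slice (index 0) plus one scalar i8 tail (index 4).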

  if (Slices.size() == 1)
    return false;

  // Break up this PHI's incoming values.
  for (unsigned Idx = 0; Idx < I.getNumIncomingValues(); ++Idx) {
    Value *Inc = I.getIncomingValue(Idx);

    IRBuilder<> B(I.getIncomingBlock(Idx)->getTerminator());
    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
      B.SetCurrentDebugLocation(IncInst->getDebugLoc());

    unsigned NameSuffix = 0;
    for (VectorSlice &S : Slices) {
      const auto ValName =
          "largephi.extractslice" + std::to_string(NameSuffix++);
      if (S.NumElts > 1) {
        SmallVector<int, 4> Mask;
        for (unsigned K = S.Idx; K < (S.Idx + S.NumElts); ++K)
          Mask.push_back(K);
        S.IncomingValues.push_back(B.CreateShuffleVector(Inc, Mask, ValName));
      } else
        S.IncomingValues.push_back(B.CreateExtractElement(Inc, S.Idx, ValName));
    }
  }

  // Now create one PHI per vector piece.
  IRBuilder<> B(I.getParent()->getFirstNonPHI());
  B.SetCurrentDebugLocation(I.getDebugLoc());

  for (VectorSlice &S : Slices) {
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
    for (const auto &[Idx, BB] : enumerate(I.blocks()))
      S.NewPHI->addIncoming(S.IncomingValues[Idx], BB);
  }

  // And replace this PHI with a vector of all the previous PHI values.
  Value *Vec = PoisonValue::get(FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
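    // llvm.vector.insert requires the index to be a multiple of the subvector
    // length; S.Idx always is, since subvector slices are laid out
    // front-to-back.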
    if (S.NumElts > 1)
      Vec =
          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
    else
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
  return true;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
