[NaryReassociate] Teach NaryReassociate about UniformityAnalysis #175167
PankajDwivedi-25 wants to merge 2 commits into main
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu

Author: Pankaj Dwivedi (PankajDwivedi-25)

Changes: This patch integrates UniformityAnalysis into the NaryReassociate pass to prefer reassociations that group uniform values together.

Full diff: https://github.com/llvm/llvm-project/pull/175167.diff

3 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
index f0474bc4352e3..4318945bd82b3 100644
--- a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
+++ b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
@@ -80,6 +80,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/ValueHandle.h"
@@ -106,7 +107,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
// Glue for old PM.
bool runImpl(Function &F, AssumptionCache *AC_, DominatorTree *DT_,
ScalarEvolution *SE_, TargetLibraryInfo *TLI_,
- TargetTransformInfo *TTI_);
+ TargetTransformInfo *TTI_, UniformityInfo *UI_ = nullptr);
private:
// Runs only one iteration of the dominator-based algorithm. See the header
@@ -183,6 +184,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
ScalarEvolution *SE;
TargetLibraryInfo *TLI;
TargetTransformInfo *TTI;
+ UniformityInfo *UI;
// A lookup table quickly telling which instructions compute the given SCEV.
// Note that there can be multiple instructions at different locations
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index ec145f2f48bea..4f1634b4eb7cf 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -104,6 +104,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -187,7 +188,11 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
- if (!runImpl(F, AC, DT, SE, TLI, TTI))
+ // UniformityInfo is optional - only available on targets that benefit from
+ // uniformity-aware reassociation (e.g., GPU targets like AMDGPU).
+ auto *UI = AM.getCachedResult<UniformityInfoAnalysis>(F);
+
+ if (!runImpl(F, AC, DT, SE, TLI, TTI, UI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
@@ -199,12 +204,14 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
DominatorTree *DT_, ScalarEvolution *SE_,
TargetLibraryInfo *TLI_,
- TargetTransformInfo *TTI_) {
+ TargetTransformInfo *TTI_,
+ UniformityInfo *UI_) {
AC = AC_;
DT = DT_;
SE = SE_;
TLI = TLI_;
TTI = TTI_;
+ UI = UI_;
DL = &F.getDataLayout();
bool Changed = false, ChangedInThisIteration;
@@ -379,13 +386,46 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
// IndexToSplit = LHS + RHS.
- if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
- return NewGEP;
- // Symmetrically, try IndexToSplit = RHS + LHS.
- if (LHS != RHS) {
+ // tryReassociateGEPAtIndex(GEP, I, LHS, RHS, ...) looks for a dominating
+ // GEP with LHS as index, then creates: NewGEP = existingGEP + RHS * scale.
+ // So the RHS becomes the "remaining" index calculation.
+ //
+ // For uniformity: prefer the remaining calculation to be uniform, as it
+ // can then stay in scalar registers. So if RHS is uniform but LHS is
+ // divergent, try LHS first (leaving uniform RHS as remainder).
+ if (UI && UI->isUniform(RHS) && !UI->isUniform(LHS)) {
+ LLVM_DEBUG(
+ dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ return NewGEP;
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
+ } else if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
+ LLVM_DEBUG(
+ dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
+ // LHS is uniform, prefer it as remainder - try RHS first
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ return NewGEP;
+ } else {
+ // Default order
if (auto *NewGEP =
- tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
return NewGEP;
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
}
}
return nullptr;
@@ -483,15 +523,47 @@ Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
// = (A op RHS) op B or (B op RHS) op A
const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
const SCEV *RHSExpr = SE->getSCEV(RHS);
- if (BExpr != RHSExpr) {
- if (auto *NewI =
- tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
- return NewI;
- }
- if (AExpr != RHSExpr) {
- if (auto *NewI =
- tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
- return NewI;
+
+ // When uniformity analysis is available (e.g., on GPU targets), prefer
+ // reassociations that group uniform values together. This allows
+ // intermediate results to stay in scalar registers (SGPRs on AMDGPU),
+ // reducing vector register (VGPR) pressure.
+ //
+ // For I = (A op B) op RHS, we can form:
+ // - (A op RHS) op B: groups A and RHS
+ // - (B op RHS) op A: groups B and RHS
+ //
+ // Prefer the grouping where both operands in the new sub-expression are
+ // uniform, as this sub-expression can then be computed in scalar registers.
+ //
+ // We only need to handle the case where B and RHS are uniform but A is
+ // divergent. The symmetric case (A and RHS uniform, B divergent) is already
+ // handled by the default order which tries (A op RHS) op B first.
+ if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+ LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for " << *I
+ << "\n");
+ if (AExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+ return NewI;
+ }
+ if (BExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+ return NewI;
+ }
+ } else {
+ // Default order: try (A op RHS) op B first
+ if (BExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+ return NewI;
+ }
+ if (AExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+ return NewI;
+ }
}
}
return nullptr;
@@ -653,16 +725,35 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
const SCEV *BExpr = SE->getSCEV(B);
const SCEV *RHSExpr = SE->getSCEV(RHS);
- if (BExpr != RHSExpr) {
- // Try (A op RHS) op B
- if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
- return NewMinMax;
- }
-
- if (AExpr != RHSExpr) {
- // Try (RHS op B) op A
- if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
- return NewMinMax;
+ // Similar to binary ops, prefer grouping uniform values together when
+ // uniformity analysis is available.
+ // For I = minmax(minmax(A, B), RHS), we can form:
+ // - minmax(minmax(A, RHS), B): groups A and RHS
+ // - minmax(minmax(B, RHS), A): groups B and RHS
+ if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+ LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for minmax " << *I
+ << "\n");
+ // Try (B op RHS) op A first - groups uniform B with uniform RHS
+ if (AExpr != RHSExpr) {
+ if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+ return NewMinMax;
+ }
+ if (BExpr != RHSExpr) {
+ if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+ return NewMinMax;
+ }
+ } else {
+ // Default order
+ if (BExpr != RHSExpr) {
+ // Try (A op RHS) op B
+ if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+ return NewMinMax;
+ }
+ if (AExpr != RHSExpr) {
+ // Try (RHS op B) op A
+ if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+ return NewMinMax;
+ }
}
return nullptr;
diff --git a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
new file mode 100644
index 0000000000000..0ba96bf039b34
--- /dev/null
+++ b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
@@ -0,0 +1,222 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; REQUIRES: amdgpu-registered-target
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes='require<uniformity>,nary-reassociate' -S | FileCheck %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @use(i32)
+
+; Test that NaryReassociate prefers grouping uniform values together when
+; uniformity analysis is available and both reassociation options exist.
+;
+; For I = (A op B) op RHS, the pass can form:
+; - (A op RHS) op B
+; - (B op RHS) op A
+;
+; When both dominating expressions exist, prefer the one grouping uniforms.
+
+; Both %d_u2 and %u1_u2 exist as dominating expressions.
+; For (d + u1) + u2:
+; - Without UA preference: would try (d + u2) first, find %d_u2, return %d_u2 + u1
+; - With UA preference: B=u1 and RHS=u2 are uniform, A=d is divergent
+; So prefer (u1 + u2) + d, returning %u1_u2 + d
+;
+
+define amdgpu_kernel void @prefer_uniform_grouping(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = add i32 [[D]], [[U2]]
+; CHECK-NEXT: [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; Create both possible reassociation targets
+ %d_u2 = add i32 %d, %u2 ; divergent + uniform
+ %u1_u2 = add i32 %u1, %u2 ; uniform + uniform (should be preferred!)
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ ; (d + u1) + u2: both (d + u2) and (u1 + u2) exist
+ ; Should prefer (u1 + u2) + d to group uniforms
+ %tmp = add i32 %d, %u1
+ %result = add i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+define amdgpu_kernel void @prefer_uniform_grouping_mul(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_mul(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = mul i32 [[D]], [[U2]]
+; CHECK-NEXT: [[U1_U2:%.*]] = mul i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = mul i32 [[U1_U2]], [[D]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ %d_u2 = mul i32 %d, %u2
+ %u1_u2 = mul i32 %u1, %u2
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ %tmp = mul i32 %d, %u1
+ %result = mul i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+define amdgpu_kernel void @only_one_option(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @only_one_option(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; Only u1 + u2 exists, not d + u2
+ %u1_u2 = add i32 %u1, %u2
+ call void @use(i32 %u1_u2)
+
+ %tmp = add i32 %d, %u1
+ %result = add i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; When no dominating expression exists, no reassociation happens
+define amdgpu_kernel void @no_dominating_expr(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @no_dominating_expr(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP:%.*]] = add i32 [[D]], [[U1]]
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[TMP]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; No dominating expressions exist
+ %tmp = add i32 %d, %u1
+ %result = add i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; Test smax: prefer grouping uniform values together
+; For smax(smax(A, B), RHS):
+; - smax(smax(A, RHS), B): groups A and RHS
+; - smax(smax(B, RHS), A): groups B and RHS
+; When B and RHS are uniform but A is divergent, prefer smax(smax(B, RHS), A)
+define amdgpu_kernel void @prefer_uniform_grouping_smax(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_smax(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[U1_U2]], i32 [[D]])
+; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; Create both possible reassociation targets
+ %d_u2 = call i32 @llvm.smax.i32(i32 %d, i32 %u2) ; divergent, uniform
+ %u1_u2 = call i32 @llvm.smax.i32(i32 %u1, i32 %u2) ; uniform, uniform (preferred!)
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ ; smax(smax(d, u1), u2): both smax(d, u2) and smax(u1, u2) exist
+ ; Should prefer smax(smax(u1, u2), d) to group uniforms
+ %tmp = call i32 @llvm.smax.i32(i32 %d, i32 %u1)
+ %result = call i32 @llvm.smax.i32(i32 %tmp, i32 %u2)
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; Test umin: prefer grouping uniform values together
+define amdgpu_kernel void @prefer_uniform_grouping_umin(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_umin(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[U1_U2]], i32 [[D]])
+; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ %d_u2 = call i32 @llvm.umin.i32(i32 %d, i32 %u2)
+ %u1_u2 = call i32 @llvm.umin.i32(i32 %u1, i32 %u2)
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ %tmp = call i32 @llvm.umin.i32(i32 %d, i32 %u1)
+ %result = call i32 @llvm.umin.i32(i32 %tmp, i32 %u2)
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; Test GEP: prefer uniform remainder in index calculation
+; For GEP with index = LHS + RHS:
+; - If RHS is uniform, prefer LHS first (uniform RHS as remainder)
+; - If LHS is uniform, prefer RHS first (uniform LHS as remainder)
+define amdgpu_kernel void @prefer_uniform_gep_remainder(ptr %base, i64 %u_offset) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_gep_remainder(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U_OFFSET:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_EXT:%.*]] = zext i32 [[D]] to i64
+; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D]])
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D]], i64 [[U_OFFSET]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_ext = zext i32 %d to i64
+
+ ; Create dominating GEP with divergent index
+ %gep_d = getelementptr i32, ptr %base, i64 %d_ext
+ call void @use_ptr(ptr %gep_d)
+
+ ; GEP with index = d + u_offset
+ ; Should prefer finding dominating GEP with d, leaving uniform u_offset as remainder
+ %idx = add i64 %d_ext, %u_offset
+ %gep_result = getelementptr i32, ptr %base, i64 %idx
+ call void @use_ptr(ptr %gep_result)
+
+ ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare void @use_ptr(ptr)
Is it normal that there are no changes on AMDGPU's tests?
if (!runImpl(F, AC, DT, SE, TLI, TTI))
// UniformityInfo is optional - only available on targets that benefit from
// uniformity-aware reassociation (e.g., GPU targets like AMDGPU).
auto *UI = AM.getCachedResult<UniformityInfoAnalysis>(F);
This doesn't seem right. We should require the analysis on all targets, but on targets that don't care about divergence the analysis should do no work and just say that everything is uniform.
And if you do want to make the analysis optional, the way to do it is via a pass option like -passes=nary-reassociate<with-uniformity> and then conditionally require the analysis.
getCachedResult is only suitable for analyses that you preserve but do not use. Otherwise you'll end up with random behavior based on details of analysis invalidation.
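A minimal sketch of the two options discussed above (the helper name and the WithUniformity flag are assumptions for illustration, not code from this patch):

```cpp
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

// Either require the analysis unconditionally (cheap on targets without
// branch divergence, where everything is reported uniform), or gate it on a
// hypothetical pass parameter such as nary-reassociate<with-uniformity>.
static UniformityInfo *
getUniformityForReassociation(bool WithUniformity, Function &F,
                              FunctionAnalysisManager &AM) {
  if (!WithUniformity)
    return nullptr;
  // getResult computes the analysis on demand (unlike getCachedResult), so
  // the pass never depends on whatever happens to be cached.
  return &AM.getResult<UniformityInfoAnalysis>(F);
}
```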
Thanks for the clarification! I've updated to use getResult/getAnalysis instead of getCachedResult/getAnalysisIfAvailable. The analysis was getting invalidated by intermediate passes (e.g., inliner) before NaryReassociate ran, so we now compute it directly to ensure it's always available. On targets without branch divergence, this has no cost as the analysis does no work.
No existing test files changed because this patch uses getCachedResult. In the AMDGPU pipeline, although AMDGPUUniformIntrinsicCombinePass computes UniformityAnalysis, it gets invalidated by inliner passes before NaryReassociate runs. Enabling this in the AMDGPU pipeline requires either a follow-up PR to add RequireAnalysisPass() before NaryReassociate, or changing to getResult to compute it within the pass (though this adds overhead to all targets). Existing tests don't have it available, and even if available, they lack the divergent values needed to trigger the preference logic.
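For reference, a sketch of the follow-up option mentioned here (the helper and its exact position in the AMDGPU pipeline are assumptions, not part of this PR):

```cpp
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/NaryReassociate.h"
using namespace llvm;

static void addNaryReassociateWithUniformity(FunctionPassManager &FPM) {
  // Recompute UniformityInfo right before NaryReassociate so a cached lookup
  // inside the pass sees a fresh, non-invalidated result.
  FPM.addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>());
  FPM.addPass(NaryReassociatePass());
}
```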
✅ With the latest revision this PR passed the C/C++ code formatter.
@@ -0,0 +1,222 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
Can you precommit this test? The new changes introduced by this patch would be easier to review with that.
Force-pushed from c3a1422 to 2e51a8a
@@ -183,6 +184,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
ScalarEvolution *SE;
TargetLibraryInfo *TLI;
TargetTransformInfo *TTI;
UniformityInfo *UI;
I skimmed through the code of UniformityInfo and I saw that we use a set to keep track of divergent values.
When we reassociate the operands, the newly created instructions will be considered as uniform, since they are not in the set. I suspect that this could be a problem.
Did I miss something?
You're right that newly created instructions won't be in the divergent set and would appear uniform. However, this isn't a problem here because:
- We only query UniformityInfo for the original operands (A, B, RHS) when deciding the reassociation order - not for newly created instructions
- These operands existed before the pass ran, so their uniformity is correctly tracked
- The newly created instructions are added to SeenExprs for SCEV-based matching, but their uniformity is never queried
This is similar to what we encountered in AMDGPUUniformIntrinsicCombine (#116953) - as long as we don't query uniformity of instructions created within the pass, we're fine.
But NaryReassociate is iterative. It runs until we reach a fixed point. SeenExprs is cleared at every iteration.
Right, I get your point.
So a fix for that could be either something similar to what was done in the AMDGPUUniformIntrinsicCombine pass, or maintaining a list of all instructions newly created by this pass and checking whether A, B, or RHS is one of them (i.e., not in the original function). If so, skip the uniformity preference and use the default order.
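Something like the following is what the second option could look like (the set and helper are hypothetical, only for illustration):

```cpp
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Values created by NaryReassociate itself are unknown to UniformityInfo,
// so treat them as "not known uniform" and fall back to the default order.
static bool isKnownUniform(const UniformityInfo *UI,
                           const SmallPtrSetImpl<const Value *> &CreatedByPass,
                           const Value *V) {
  return UI && !CreatedByPass.count(V) && UI->isUniform(V);
}
```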
I think that using another separate data-structure to preserve the analysis (as it was done in AMDGPUUniformIntrinsicCombine) would add technical debt. It looks like we have 2 use-cases where updating UniformityInfo seems to be desired.
Since we see no changes in the tests, I do not see any hurry to take the easy route.
I'm not sure it will be of any use here, but we implemented a prototype for updatable uniformity analysis here.
Unfortunately, I don't have time to continue the project at the moment. But if there are any questions, I'd be happy to answer them!
One of the most fundamental problems making it hard to integrate into upstream is that it relies on a newly added way to register RAUW and Delete callbacks with values. While it has low overhead when unused, that overhead is not zero, and hard to measure as well.
Current plan: Instead of tracking divergent values (unknown = uniform), we track uniform values (unknown = divergent). This conservatively treats newly created instructions as divergent, avoiding the need for complex uniformity updates during iterative passes like NaryReassociate. While this may miss some optimization opportunities within a single pass, it ensures correctness and simplicity. Fresh UniformityInfo is computed for each subsequent pass anyway.
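A sketch of the query semantics under that plan (UniformValues is a placeholder name; the real analysis currently stores the divergent set instead):

```cpp
#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/Value.h"
using namespace llvm;

struct UniformSetSketch {
  DenseSet<const Value *> UniformValues;

  // Anything not proven uniform, including instructions created after the
  // analysis ran, is conservatively reported as divergent.
  bool isUniform(const Value *V) const { return UniformValues.contains(V); }
  bool isDivergent(const Value *V) const { return !isUniform(V); }
};
```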
Force-pushed from 6a28bd6 to 72d8c9a
You can test this locally with the following command:

git diff origin/main HEAD -- 'llvm/include/llvm/**/*.h' 'llvm/include/llvm-c/**/*.h' 'llvm/include/llvm/Demangle/**/*.h'
Then run idt on the changed files with appropriate --export-macro and --include-header flags.
View the diff from ids-check here.

diff --git a/llvm/include/llvm/Analysis/UniformityAnalysis.h b/llvm/include/llvm/Analysis/UniformityAnalysis.h
index a067a9b42..a58a2d714 100644
--- a/llvm/include/llvm/Analysis/UniformityAnalysis.h
+++ b/llvm/include/llvm/Analysis/UniformityAnalysis.h
@@ -14,6 +14,7 @@
#ifndef LLVM_ANALYSIS_UNIFORMITYANALYSIS_H
#define LLVM_ANALYSIS_UNIFORMITYANALYSIS_H
+#include "llvm/include/llvm/Support/Compiler.h"
#include "llvm/ADT/GenericUniformityInfo.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/SSAContext.h"
@@ -35,7 +36,7 @@ public:
using Result = UniformityInfo;
/// Run the analysis pass over a function and produce a dominator tree.
- UniformityInfo run(Function &F, FunctionAnalysisManager &);
+ LLVM_ABI UniformityInfo run(Function &F, FunctionAnalysisManager &);
// TODO: verify analysis
};
@@ -46,15 +47,15 @@ class UniformityInfoPrinterPass
raw_ostream &OS;
public:
- explicit UniformityInfoPrinterPass(raw_ostream &OS);
+ LLVM_ABI explicit UniformityInfoPrinterPass(raw_ostream &OS);
- PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
static bool isRequired() { return true; }
};
/// Legacy analysis pass which computes a \ref CycleInfo.
-class UniformityInfoWrapperPass : public FunctionPass {
+class LLVM_ABI UniformityInfoWrapperPass : public FunctionPass {
Function *m_function = nullptr;
UniformityInfo m_uniformityInfo;
diff --git a/llvm/include/llvm/IR/User.h b/llvm/include/llvm/IR/User.h
index da886a508..d68414c89 100644
--- a/llvm/include/llvm/IR/User.h
+++ b/llvm/include/llvm/IR/User.h
@@ -18,6 +18,7 @@
#ifndef LLVM_IR_USER_H
#define LLVM_IR_USER_H
+#include "llvm/include/llvm/Support/Compiler.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/Use.h"
@@ -142,7 +143,7 @@ protected:
protected:
// Use deleteValue() to delete a generic User.
- ~User();
+ LLVM_ABI ~User();
public:
User(const User &) = delete;
diff --git a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
index 4318945bd..3a7f291c8 100644
--- a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
+++ b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
@@ -78,6 +78,7 @@
#ifndef LLVM_TRANSFORMS_SCALAR_NARYREASSOCIATE_H
#define LLVM_TRANSFORMS_SCALAR_NARYREASSOCIATE_H
+#include "llvm/include/llvm/Support/Compiler.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/UniformityAnalysis.h"
@@ -102,10 +103,10 @@ class Value;
class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
public:
- PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
// Glue for old PM.
- bool runImpl(Function &F, AssumptionCache *AC_, DominatorTree *DT_,
+ LLVM_ABI bool runImpl(Function &F, AssumptionCache *AC_, DominatorTree *DT_,
ScalarEvolution *SE_, TargetLibraryInfo *TLI_,
TargetTransformInfo *TTI_, UniformityInfo *UI_ = nullptr);
Force-pushed from 72d8c9a to 9b13647
dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
// LHS is uniform, prefer it as remainder - try RHS first
if (LHS != RHS) {
if (auto *NewGEP =
tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
return NewGEP;
}
if (auto *NewGEP =
// For uniformity: prefer the remaining calculation to be uniform, as it
// can then stay in scalar registers.
This is target specific and not strictly true
return NewGEP;
}
if (auto *NewGEP =
tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
Why is the commuted case guarded by LHS != RHS, and not this one? This could also be moved above the UI checks
This patch integrates UniformityAnalysis into the NaryReassociate pass to prefer reassociations that group uniform values together. This optimization benefits GPU targets (like AMDGPU) by allowing intermediate results to stay in scalar registers (SGPRs), reducing vector register (VGPR) pressure.

For an expression I = (A op B) op RHS, NaryReassociate can form either:

- (A op RHS) op B
- (B op RHS) op A

When both options have dominating expressions available, the pass previously chose arbitrarily based on operand order. With this patch, when UniformityAnalysis indicates that B and RHS are uniform but A is divergent, the pass prefers (B op RHS) op A so the sub-expression (B op RHS) can be computed entirely in scalar registers.

It is supported for:

- binary operators (add, mul)
- min/max intrinsics
- GEP index calculations

Ref: SWDEV-500808