diff --git a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
index f0474bc4352e3..4318945bd82b3 100644
--- a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
+++ b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
@@ -80,6 +80,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/ValueHandle.h"
 
@@ -106,7 +107,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
   // Glue for old PM.
   bool runImpl(Function &F, AssumptionCache *AC_, DominatorTree *DT_,
                ScalarEvolution *SE_, TargetLibraryInfo *TLI_,
-               TargetTransformInfo *TTI_);
+               TargetTransformInfo *TTI_, UniformityInfo *UI_ = nullptr);
 
 private:
   // Runs only one iteration of the dominator-based algorithm. See the header
@@ -183,6 +184,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
   ScalarEvolution *SE;
   TargetLibraryInfo *TLI;
   TargetTransformInfo *TTI;
+  UniformityInfo *UI;
 
   // A lookup table quickly telling which instructions compute the given SCEV.
   // Note that there can be multiple instructions at different locations
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index ec145f2f48bea..6208c24c346d1 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -104,6 +104,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -141,6 +142,7 @@ class NaryReassociateLegacyPass : public FunctionPass {
     AU.addRequired<ScalarEvolutionWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<UniformityInfoWrapperPass>();
     AU.setPreservesCFG();
   }
 
@@ -159,6 +161,7 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
 INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
                     "Nary reassociation", false, false)
 
@@ -176,7 +179,11 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
   auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
+  // UniformityInfo is required on all targets, but on targets without branch
+  // divergence it does no work and reports everything as uniform.
+  auto *UI = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+
+  return Impl.runImpl(F, AC, DT, SE, TLI, TTI, UI);
 }
 
 PreservedAnalyses NaryReassociatePass::run(Function &F,
@@ -187,7 +194,11 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
   auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
   auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
 
-  if (!runImpl(F, AC, DT, SE, TLI, TTI))
+  // UniformityInfo is required on all targets, but on targets without branch
+  // divergence it does no work and reports everything as uniform.
+  auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+
+  if (!runImpl(F, AC, DT, SE, TLI, TTI, UI))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
@@ -199,12 +210,14 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
 bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
                                   DominatorTree *DT_, ScalarEvolution *SE_,
                                   TargetLibraryInfo *TLI_,
-                                  TargetTransformInfo *TTI_) {
+                                  TargetTransformInfo *TTI_,
+                                  UniformityInfo *UI_) {
   AC = AC_;
   DT = DT_;
   SE = SE_;
   TLI = TLI_;
   TTI = TTI_;
+  UI = UI_;
   DL = &F.getDataLayout();
 
   bool Changed = false, ChangedInThisIteration;
@@ -379,6 +392,33 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
   Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
   // IndexToSplit = LHS + RHS.
+  // tryReassociateGEPAtIndex(GEP, I, LHS, RHS, ...) looks for a dominating
+  // GEP with LHS as index, then creates: NewGEP = existingGEP + RHS * scale.
+  // So the RHS becomes the "remaining" index calculation.
+  //
+  // For uniformity: prefer the remaining calculation to be uniform, as it
+  // can then stay in scalar registers.
+  //
+  // Default order tries LHS first (RHS as remainder). If LHS is uniform and
+  // RHS is divergent, we want to try RHS first so uniform LHS becomes the
+  // remainder. The case where RHS is uniform and LHS is divergent is already
+  // handled by the default order.
+  if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
+    LLVM_DEBUG(
+        dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
+    // LHS is uniform, prefer it as remainder - try RHS first
+    if (LHS != RHS) {
+      if (auto *NewGEP =
+              tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+        return NewGEP;
+    }
+    if (auto *NewGEP =
+            tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+      return NewGEP;
+    return nullptr;
+  }
+
+  // Default order
   if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
     return NewGEP;
   // Symmetrically, try IndexToSplit = RHS + LHS.
@@ -483,6 +523,39 @@ Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
   //   = (A op RHS) op B or (B op RHS) op A
   const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
   const SCEV *RHSExpr = SE->getSCEV(RHS);
+
+  // When uniformity analysis is available (e.g., on GPU targets), prefer
+  // reassociations that group uniform values together. This allows
+  // intermediate results to stay in scalar registers (SGPRs on AMDGPU),
+  // reducing vector register (VGPR) pressure.
+  //
+  // For I = (A op B) op RHS, we can form:
+  //  - (A op RHS) op B: groups A and RHS
+  //  - (B op RHS) op A: groups B and RHS
+  //
+  // Prefer the grouping where both operands in the new sub-expression are
+  // uniform, as this sub-expression can then be computed in scalar registers.
+  //
+  // We only need to handle the case where B and RHS are uniform but A is
+  // divergent. The symmetric case (A and RHS uniform, B divergent) is already
+  // handled by the default order which tries (A op RHS) op B first.
+  if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+    LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for " << *I
+                      << "\n");
+    if (AExpr != RHSExpr) {
+      if (auto *NewI =
+              tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+        return NewI;
+    }
+    if (BExpr != RHSExpr) {
+      if (auto *NewI =
+              tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+        return NewI;
+    }
+    return nullptr;
+  }
+
+  // Default order: try (A op RHS) op B first
   if (BExpr != RHSExpr) {
     if (auto *NewI =
             tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
       return NewI;
@@ -653,6 +726,27 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
   const SCEV *BExpr = SE->getSCEV(B);
   const SCEV *RHSExpr = SE->getSCEV(RHS);
 
+  // Similar to binary ops, prefer grouping uniform values together when
+  // uniformity analysis is available.
+  // For I = minmax(minmax(A, B), RHS), we can form:
+  //  - minmax(minmax(A, RHS), B): groups A and RHS
+  //  - minmax(minmax(B, RHS), A): groups B and RHS
+  if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+    LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for minmax " << *I
+                      << "\n");
+    // Try (B op RHS) op A first - groups uniform B with uniform RHS
+    if (AExpr != RHSExpr) {
+      if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+        return NewMinMax;
+    }
+    if (BExpr != RHSExpr) {
+      if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+        return NewMinMax;
+    }
+    return nullptr;
+  }
+
+  // Default order
   if (BExpr != RHSExpr) {
     // Try (A op RHS) op B
     if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
       return NewMinMax;
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 6940c1b238e1d..61e8ad30dc44f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -515,9 +515,10 @@
 ; GCN-O1-OPTS-NEXT: Straight line strength reduction
 ; GCN-O1-OPTS-NEXT: Early CSE
 ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
 ; GCN-O1-OPTS-NEXT: Nary reassociation
 ; GCN-O1-OPTS-NEXT: Early CSE
-; GCN-O1-OPTS-NEXT: Cycle Info Analysis
 ; GCN-O1-OPTS-NEXT: Uniformity Analysis
 ; GCN-O1-OPTS-NEXT: AMDGPU IR optimizations
 ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -831,9 +832,10 @@
 ; GCN-O2-NEXT: Straight line strength reduction
 ; GCN-O2-NEXT: Early CSE
 ; GCN-O2-NEXT: Scalar Evolution Analysis
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
 ; GCN-O2-NEXT: Nary reassociation
 ; GCN-O2-NEXT: Early CSE
-; GCN-O2-NEXT: Cycle Info Analysis
 ; GCN-O2-NEXT: Uniformity Analysis
 ; GCN-O2-NEXT: AMDGPU IR optimizations
 ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -1163,9 +1165,10 @@
 ; GCN-O3-NEXT: Optimization Remark Emitter
 ; GCN-O3-NEXT: Global Value Numbering
 ; GCN-O3-NEXT: Scalar Evolution Analysis
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
 ; GCN-O3-NEXT: Nary reassociation
 ; GCN-O3-NEXT: Early CSE
-; GCN-O3-NEXT: Cycle Info Analysis
 ; GCN-O3-NEXT: Uniformity Analysis
 ; GCN-O3-NEXT: AMDGPU IR optimizations
 ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
new file mode 100644
index 0000000000000..487a6d9270121
--- /dev/null
+++ b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; REQUIRES: amdgpu-registered-target
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes='nary-reassociate' -S | FileCheck %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare void @use(i32)
+
+; Test that NaryReassociate prefers grouping uniform values together when
+; uniformity analysis is available and both reassociation options exist.
+;
+; For I = (A op B) op RHS, the pass can form:
+;  - (A op RHS) op B
+;  - (B op RHS) op A
+;
+; When both dominating expressions exist, prefer the one grouping uniforms.
+
+; Both %d_u2 and %u1_u2 exist as dominating expressions.
+; For (d + u1) + u2:
+;  - Without UA preference: would try (d + u2) first, find %d_u2, return %d_u2 + u1
+;  - With UA preference: B=u1 and RHS=u2 are uniform, A=d is divergent
+;    So prefer (u1 + u2) + d, returning %u1_u2 + d
+;
+
+define amdgpu_kernel void @prefer_uniform_grouping(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = add i32 [[D]], [[U2]]
+; CHECK-NEXT: [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; Create both possible reassociation targets
+  %d_u2 = add i32 %d, %u2 ; divergent + uniform
+  %u1_u2 = add i32 %u1, %u2 ; uniform + uniform (should be preferred!)
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  ; (d + u1) + u2: both (d + u2) and (u1 + u2) exist
+  ; Should prefer (u1 + u2) + d to group uniforms
+  %tmp = add i32 %d, %u1
+  %result = add i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+define amdgpu_kernel void @prefer_uniform_grouping_mul(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_mul(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = mul i32 [[D]], [[U2]]
+; CHECK-NEXT: [[U1_U2:%.*]] = mul i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = mul i32 [[U1_U2]], [[D]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  %d_u2 = mul i32 %d, %u2
+  %u1_u2 = mul i32 %u1, %u2
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  %tmp = mul i32 %d, %u1
+  %result = mul i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+define amdgpu_kernel void @only_one_option(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @only_one_option(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; Only u1 + u2 exists, not d + u2
+  %u1_u2 = add i32 %u1, %u2
+  call void @use(i32 %u1_u2)
+
+  %tmp = add i32 %d, %u1
+  %result = add i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; When no dominating expression exists, no reassociation happens
+define amdgpu_kernel void @no_dominating_expr(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @no_dominating_expr(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP:%.*]] = add i32 [[D]], [[U1]]
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[TMP]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; No dominating expressions exist
+  %tmp = add i32 %d, %u1
+  %result = add i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; Test smax: prefer grouping uniform values together
+; For smax(smax(A, B), RHS):
+;  - smax(smax(A, RHS), B): groups A and RHS
+;  - smax(smax(B, RHS), A): groups B and RHS
+; When B and RHS are uniform but A is divergent, prefer smax(smax(B, RHS), A)
+define amdgpu_kernel void @prefer_uniform_grouping_smax(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_smax(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[U1_U2]], i32 [[D]])
+; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT: ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; Create both possible reassociation targets
+  %d_u2 = call i32 @llvm.smax.i32(i32 %d, i32 %u2) ; divergent, uniform
+  %u1_u2 = call i32 @llvm.smax.i32(i32 %u1, i32 %u2) ; uniform, uniform (preferred!)
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  ; smax(smax(d, u1), u2): both smax(d, u2) and smax(u1, u2) exist
+  ; Should prefer smax(smax(u1, u2), d) to group uniforms
+  %tmp = call i32 @llvm.smax.i32(i32 %d, i32 %u1)
+  %result = call i32 @llvm.smax.i32(i32 %tmp, i32 %u2)
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; Test umin: prefer grouping uniform values together
+define amdgpu_kernel void @prefer_uniform_grouping_umin(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_umin(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[U1_U2]], i32 [[D]])
+; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT: ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  %d_u2 = call i32 @llvm.umin.i32(i32 %d, i32 %u2)
+  %u1_u2 = call i32 @llvm.umin.i32(i32 %u1, i32 %u2)
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  %tmp = call i32 @llvm.umin.i32(i32 %d, i32 %u1)
+  %result = call i32 @llvm.umin.i32(i32 %tmp, i32 %u2)
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; Test GEP with LHS=uniform, RHS=divergent
+define amdgpu_kernel void @gep_lhs_uniform_rhs_divergent(ptr %base, i64 %u_offset) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_lhs_uniform_rhs_divergent(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U_OFFSET:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_EXT:%.*]] = zext i32 [[D]] to i64
+; CHECK-NEXT: [[GEP_U:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U_OFFSET]]
+; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_U]])
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D]])
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D]], i64 [[U_OFFSET]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT: ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_ext = zext i32 %d to i64
+
+  ; Create BOTH dominating GEPs so there's a choice
+  %gep_u = getelementptr i32, ptr %base, i64 %u_offset ; uniform index
+  %gep_d = getelementptr i32, ptr %base, i64 %d_ext ; divergent index
+
+  call void @use_ptr(ptr %gep_u)
+  call void @use_ptr(ptr %gep_d)
+
+  ; idx = u_offset + d_ext (LHS=uniform, RHS=divergent)
+  %idx = add i64 %u_offset, %d_ext
+  %gep_result = getelementptr i32, ptr %base, i64 %idx
+  call void @use_ptr(ptr %gep_result)
+
+  ret void
+}
+
+; Test GEP with LHS=divergent, RHS=uniform
+define amdgpu_kernel void @gep_lhs_divergent_rhs_uniform(ptr %base, i64 %u_offset) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_lhs_divergent_rhs_uniform(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U_OFFSET:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_EXT:%.*]] = zext i32 [[D]] to i64
+; CHECK-NEXT: [[GEP_U:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U_OFFSET]]
+; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_U]])
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D]])
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D]], i64 [[U_OFFSET]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT: ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_ext = zext i32 %d to i64
+
+  ; Create BOTH dominating GEPs so there's a choice
+  %gep_u = getelementptr i32, ptr %base, i64 %u_offset ; uniform index
+  %gep_d = getelementptr i32, ptr %base, i64 %d_ext ; divergent index
+
+  call void @use_ptr(ptr %gep_u)
+  call void @use_ptr(ptr %gep_d)
+
+  ; idx = d_ext + u_offset (LHS=divergent, RHS=uniform)
+  %idx = add i64 %d_ext, %u_offset
+  %gep_result = getelementptr i32, ptr %base, i64 %idx
+  call void @use_ptr(ptr %gep_result)
+
+  ret void
+}
+
+; Test GEP with both LHS and RHS uniform - no preference needed
+define amdgpu_kernel void @gep_both_uniform(ptr %base, i64 %u1, i64 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_both_uniform(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U1:%.*]], i64 [[U2:%.*]]) {
+; CHECK-NEXT: [[GEP_U1:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U1]]
+; CHECK-NEXT: [[GEP_U2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U2]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_U1]])
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_U2]])
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_U1]], i64 [[U2]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT: ret void
+;
+  ; Create both dominating GEPs with uniform indices
+  %gep_u1 = getelementptr i32, ptr %base, i64 %u1
+  %gep_u2 = getelementptr i32, ptr %base, i64 %u2
+
+  call void @use_ptr(ptr %gep_u1)
+  call void @use_ptr(ptr %gep_u2)
+
+  ; idx = u1 + u2 (both uniform - no preference needed)
+  %idx = add i64 %u1, %u2
+  %gep_result = getelementptr i32, ptr %base, i64 %idx
+  call void @use_ptr(ptr %gep_result)
+
+  ret void
+}
+
+; Test GEP with both LHS and RHS divergent - no preference needed
+define amdgpu_kernel void @gep_both_divergent(ptr %base) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_both_divergent(
+; CHECK-SAME: ptr [[BASE:%.*]]) {
+; CHECK-NEXT: [[D1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D2:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[D1_EXT:%.*]] = zext i32 [[D1]] to i64
+; CHECK-NEXT: [[D2_EXT:%.*]] = zext i32 [[D2]] to i64
+; CHECK-NEXT: [[GEP_D1:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D1_EXT]]
+; CHECK-NEXT: [[GEP_D2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D2_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D1]])
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D2]])
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D1]], i64 [[D2_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT: ret void
+;
+  %d1 = call i32 @llvm.amdgcn.workitem.id.x()
+  %d2 = call i32 @llvm.amdgcn.workitem.id.y()
+  %d1_ext = zext i32 %d1 to i64
+  %d2_ext = zext i32 %d2 to i64
+
+  ; Create both dominating GEPs with divergent indices
+  %gep_d1 = getelementptr i32, ptr %base, i64 %d1_ext
+  %gep_d2 = getelementptr i32, ptr %base, i64 %d2_ext
+
+  call void @use_ptr(ptr %gep_d1)
+  call void @use_ptr(ptr %gep_d2)
+
+  ; idx = d1_ext + d2_ext (both divergent - no preference needed)
+  %idx = add i64 %d1_ext, %d2_ext
+  %gep_result = getelementptr i32, ptr %base, i64 %idx
+  call void @use_ptr(ptr %gep_result)
+
+  ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare void @use_ptr(ptr)
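
For reference, the effect of the new preference can be summarized with the add case distilled from the prefer_uniform_grouping test above; this is an illustrative sketch of that test, not an additional test in the patch:

; %d is divergent (workitem id), %u1/%u2 are uniform kernel arguments, and both
; %d_u2 = add i32 %d, %u2 and %u1_u2 = add i32 %u1, %u2 dominate the use.
  %tmp = add i32 %d, %u1
  %result = add i32 %tmp, %u2
; Without the uniformity preference, NaryReassociate finds %d_u2 first and
; rewrites %result as add i32 %d_u2, %u1. With UniformityInfo it prefers the
; all-uniform subexpression, keeping the divergent value as the remainder:
  %result = add i32 %u1_u2, %d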