diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 24bef82464495..39ae5d38ec70b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -255,6 +255,8 @@ class AMDGPUCodeGenPrepareImpl bool visitIntrinsicInst(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); + bool visitMbcntLo(IntrinsicInst &I); + bool visitMbcntHi(IntrinsicInst &I); bool run(); }; @@ -1915,6 +1917,10 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { return visitFMinLike(I); case Intrinsic::sqrt: return visitSqrt(I); + case Intrinsic::amdgcn_mbcnt_lo: + return visitMbcntLo(I); + case Intrinsic::amdgcn_mbcnt_hi: + return visitMbcntHi(I); default: return false; } @@ -2113,6 +2119,166 @@ INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) +bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) { + // On wave32 targets, mbcnt.lo(~0, 0) can be replaced with workitem.id.x. + if (!ST.isWave32()) + return false; + + // Check for pattern mbcnt.lo(~0, 0). + auto *Arg0C = dyn_cast(I.getArgOperand(0)); + auto *Arg1C = dyn_cast(I.getArgOperand(1)); + if (!Arg0C || !Arg1C || !Arg0C->isAllOnesValue() || !Arg1C->isZero()) + return false; + + // Abort if wave size is not known. + if (!ST.isWaveSizeKnown()) + return false; + + unsigned Wave = ST.getWavefrontSize(); + + if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) { + unsigned XLen = *MaybeX; + + if (XLen == Wave) { + IRBuilder<> B(&I); + CallInst *NewCall = + B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}); + NewCall->takeName(&I); + ST.makeLIDRangeMetadata(NewCall); + I.replaceAllUsesWith(NewCall); + I.eraseFromParent(); + return true; + } + // Handle bitmask case: when X dimension evenly splits into waves. + // mbcnt.lo(~0, 0) = workitem.id.x() & (wave_size - 1). 
+ if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) { + if (isPowerOf2_32(Wave)) { + IRBuilder<> B(&I); + CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}); + ST.makeLIDRangeMetadata(Tid); + IntegerType *ITy = cast(Tid->getType()); + Constant *Mask = ConstantInt::get(ITy, Wave - 1); + Instruction *AndInst = cast(B.CreateAnd(Tid, Mask)); + AndInst->takeName(&I); + // Note: Range metadata cannot be applied to 'and' instructions. + I.replaceAllUsesWith(AndInst); + I.eraseFromParent(); + return true; + } + } + } + + return false; +} + +bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) { + // exec_hi is all 0, so this is just a copy on wave32. + // However, only optimize if we have the same conditions as mbcnt.lo. + if (ST.isWave32()) { + // Abort if wave size is not known. + if (!ST.isWaveSizeKnown()) + return false; + + unsigned Wave = ST.getWavefrontSize(); + + if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) { + unsigned XLen = *MaybeX; + + if (XLen == Wave) { + I.replaceAllUsesWith(I.getArgOperand(1)); + I.eraseFromParent(); + return true; + } + } + } + + // Pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0)). + auto *HiArg1 = dyn_cast(I.getArgOperand(1)); + if (!HiArg1) + return false; + + Function *CalledF = HiArg1->getCalledFunction(); + if (!CalledF || CalledF->getIntrinsicID() != Intrinsic::amdgcn_mbcnt_lo) + return false; + + // hi arg0 must be all-ones. + auto *HiArg0C = dyn_cast(I.getArgOperand(0)); + if (!HiArg0C || !HiArg0C->isAllOnesValue()) + return false; + + // lo args: arg0 == ~0, arg1 == 0. + Value *Lo0 = HiArg1->getArgOperand(0); + Value *Lo1 = HiArg1->getArgOperand(1); + auto *Lo0C = dyn_cast(Lo0); + auto *Lo1C = dyn_cast(Lo1); + if (!Lo0C || !Lo1C || !Lo0C->isAllOnesValue() || !Lo1C->isZero()) + return false; + + // Query reqd_work_group_size via subtarget helper and compare X to wave + // size conservatively. + // Abort if wave size is not known. 
+ if (!ST.isWaveSizeKnown()) + return false; + + unsigned Wave = ST.getWavefrontSize(); + + if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) { + unsigned XLen = *MaybeX; + + if (XLen == Wave) { + IRBuilder<> B(&I); + CallInst *NewCall = + B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}); + NewCall->takeName(&I); + // Attach range metadata when available. + ST.makeLIDRangeMetadata(NewCall); + I.replaceAllUsesWith(NewCall); + I.eraseFromParent(); + return true; + } + // Optional: if X dimension evenly splits into wavefronts we can + // replace lane-id computation with a bitmask when the wave is a + // power-of-two. Use the Subtarget helper to conservatively decide + // when per-wave tiling is preserved. + if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) { + if (isPowerOf2_32(Wave)) { + // Construct: tid = workitem.id.x(); mask = Wave-1; res = tid & mask + IRBuilder<> B(&I); + CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}); + ST.makeLIDRangeMetadata(Tid); + IntegerType *ITy = cast(Tid->getType()); + Constant *Mask = ConstantInt::get(ITy, Wave - 1); + Instruction *AndInst = cast(B.CreateAnd(Tid, Mask)); + AndInst->takeName(&I); + // Note: Range metadata cannot be applied to 'and' instructions. + I.replaceAllUsesWith(AndInst); + I.eraseFromParent(); + return true; + } + } + } else { + // No reqd_work_group_size metadata: be conservative and only handle the + // common test harness cases where reqd_work_group_size metadata exists + // and equals 32/64. 
+ const MDNode *Node = F.getMetadata("reqd_work_group_size"); + if (Node && Node->getNumOperands() == 3) { + unsigned XLen = + mdconst::extract(Node->getOperand(0))->getZExtValue(); + if (XLen == Wave) { + IRBuilder<> B(&I); + CallInst *NewCall = + B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}); + NewCall->takeName(&I); + I.replaceAllUsesWith(NewCall); + I.eraseFromParent(); + return true; + } + } + } + + return false; +} + char AMDGPUCodeGenPrepare::ID = 0; FunctionPass *llvm::createAMDGPUCodeGenPreparePass() { diff --git a/llvm/test/Transforms/AMDGPU/mbcnt-lo-to-bitmask.ll b/llvm/test/Transforms/AMDGPU/mbcnt-lo-to-bitmask.ll new file mode 100644 index 0000000000000..f84217705818e --- /dev/null +++ b/llvm/test/Transforms/AMDGPU/mbcnt-lo-to-bitmask.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes=amdgpu-codegenprepare < %s | FileCheck %s + +; Test that mbcnt.lo(~0, 0) is optimized to workitem.id.x() & 0x1f on wave32 +; when work group size is multiple of wave size (64 = 2 * 32) +define i32 @test_mbcnt_lo_wave32_bitmask() !reqd_work_group_size !0 { +; CHECK-LABEL: define i32 @test_mbcnt_lo_wave32_bitmask( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call {{.*}} i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 31 +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + ret i32 %a +} + +; Test with X dimension = 128 (4 * 32 waves) +define i32 @test_mbcnt_lo_wave32_bitmask_128() !reqd_work_group_size !1 { +; CHECK-LABEL: define i32 @test_mbcnt_lo_wave32_bitmask_128( +; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META1:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call {{.*}} i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = 
and i32 [[TMP0]], 31 +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + ret i32 %a +} + +!0 = !{i32 64, i32 1, i32 1} ; 64 = 2 * 32 wave size +!1 = !{i32 128, i32 1, i32 1} ; 128 = 4 * 32 wave size + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone speculatable willreturn } +;. +; CHECK: [[META0]] = !{i32 64, i32 1, i32 1} +; CHECK: [[META1]] = !{i32 128, i32 1, i32 1} +;. \ No newline at end of file diff --git a/llvm/test/Transforms/AMDGPU/mbcnt-to-bitmask-neg.ll b/llvm/test/Transforms/AMDGPU/mbcnt-to-bitmask-neg.ll new file mode 100644 index 0000000000000..470751c3c73f3 --- /dev/null +++ b/llvm/test/Transforms/AMDGPU/mbcnt-to-bitmask-neg.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple amdgcn-unknown-amdhsa -passes=amdgpu-codegenprepare < %s | FileCheck %s + +define i32 @test_mbcnt_non_wave_size() !reqd_work_group_size !1 { +; CHECK-LABEL: define i32 @test_mbcnt_non_wave_size( +; CHECK-SAME: ) !reqd_work_group_size [[META0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[A]]) +; CHECK-NEXT: ret i32 [[B]] +; +entry: + %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %b = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %a) + ret i32 %b +} + +!1 = !{i32 48, i32 1, i32 1} + +; Declarations +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) +;. +; CHECK: [[META0]] = !{i32 48, i32 1, i32 1} +;. 
diff --git a/llvm/test/Transforms/AMDGPU/mbcnt-to-bitmask-posit.ll b/llvm/test/Transforms/AMDGPU/mbcnt-to-bitmask-posit.ll new file mode 100644 index 0000000000000..ad3993bbc30c1 --- /dev/null +++ b/llvm/test/Transforms/AMDGPU/mbcnt-to-bitmask-posit.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple amdgcn-unknown-amdhsa -mcpu=gfx906 -passes=amdgpu-codegenprepare < %s | FileCheck %s + +define i32 @test_mbcnt_wave64_to_workitem() !reqd_work_group_size !1 { +; CHECK-LABEL: define i32 @test_mbcnt_wave64_to_workitem( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK-NEXT: [[B:%.*]] = call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: ret i32 [[B]] +; +entry: + %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %b = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %a) + ret i32 %b +} + +!1 = !{i32 64, i32 1, i32 1} + +; Declarations +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() +;. +; CHECK: [[META0]] = !{i32 64, i32 1, i32 1} +;. 
diff --git a/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-neg.ll b/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-neg.ll new file mode 100644 index 0000000000000..af8d713b798ed --- /dev/null +++ b/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-neg.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple amdgcn-unknown-amdhsa -passes=amdgpu-codegenprepare < %s | FileCheck %s + +define i32 @test_mbcnt_no_reqd_work_group_size() { +; CHECK-LABEL: define i32 @test_mbcnt_no_reqd_work_group_size() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[A]]) +; CHECK-NEXT: ret i32 [[B]] +; +entry: + %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %b = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %a) + ret i32 %b +} + +; Declarations +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-posit.ll b/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-posit.ll new file mode 100644 index 0000000000000..fe049e1627409 --- /dev/null +++ b/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-posit.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple amdgcn-unknown-amdhsa -mcpu=gfx906 -passes=amdgpu-codegenprepare < %s | FileCheck %s + +define i32 @test_mbcnt_to_workitem() !reqd_work_group_size !0 { +; CHECK-LABEL: define i32 @test_mbcnt_to_workitem( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK-NEXT: [[B:%.*]] = call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: ret i32 [[B]] +; +entry: + %a = call i32 
@llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %b = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %a) + ret i32 %b +} + +!0 = !{i32 64, i32 1, i32 1} + +; Declarations +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() +;. +; CHECK: [[META0]] = !{i32 64, i32 1, i32 1} +;. diff --git a/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-wave32-neg.ll b/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-wave32-neg.ll new file mode 100644 index 0000000000000..30514562556ea --- /dev/null +++ b/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-wave32-neg.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes=amdgpu-codegenprepare < %s | FileCheck %s + +; Test that mbcnt.lo(~0, 0) IS optimized on wave32 with bitmask when work group size allows even wave distribution +define i32 @test_mbcnt_lo_wave32_non_matching_wgs() !reqd_work_group_size !0 { +; CHECK-LABEL: define i32 @test_mbcnt_lo_wave32_non_matching_wgs( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call {{.*}} i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[A:%.*]] = and i32 [[TMP0]], 31 +; CHECK-NEXT: ret i32 [[A]] +; +entry: + %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + ret i32 %a +} + +; Test that mbcnt.lo(~0, 0) is NOT optimized on wave32 when no reqd_work_group_size is specified +define i32 @test_mbcnt_lo_wave32_no_wgs() { +; CHECK-LABEL: define i32 @test_mbcnt_lo_wave32_no_wgs( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK-NEXT: ret i32 [[A]] +; +entry: + %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + ret i32 %a +} + +; Test that mbcnt.lo with non-all-ones first arg is NOT optimized +define i32 
@test_mbcnt_lo_wave32_partial_mask() !reqd_work_group_size !1 { +; CHECK-LABEL: define i32 @test_mbcnt_lo_wave32_partial_mask( +; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META1:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 15, i32 0) +; CHECK-NEXT: ret i32 [[A]] +; +entry: + %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 15, i32 0) + ret i32 %a +} + +; Test that mbcnt.lo with non-zero second arg is NOT optimized +define i32 @test_mbcnt_lo_wave32_non_zero_base() !reqd_work_group_size !1 { +; CHECK-LABEL: define i32 @test_mbcnt_lo_wave32_non_zero_base( +; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 5) +; CHECK-NEXT: ret i32 [[A]] +; +entry: + %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 5) + ret i32 %a +} + +!0 = !{i32 48, i32 1, i32 1} ; Work group size 48 != wave size 32 +!1 = !{i32 32, i32 1, i32 1} ; Work group size 32 == wave size 32 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } +;. +; CHECK: [[META0]] = !{i32 48, i32 1, i32 1} +; CHECK: [[META1]] = !{i32 32, i32 1, i32 1} +;. 
diff --git a/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-wave32.ll b/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-wave32.ll new file mode 100644 index 0000000000000..07a5028ca1ee5 --- /dev/null +++ b/llvm/test/Transforms/AMDGPU/mbcnt-to-workitem-wave32.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes=amdgpu-codegenprepare < %s | FileCheck %s + +define i32 @test_mbcnt_lo_wave32() !reqd_work_group_size !0 { +; CHECK-LABEL: define i32 @test_mbcnt_lo_wave32( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call range(i32 0, 32) i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: ret i32 [[TMP0]] +; +entry: + %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + ret i32 %a +} + +!0 = !{i32 32, i32 1, i32 1} + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone speculatable willreturn } +;. +; CHECK: [[META0]] = !{i32 32, i32 1, i32 1} +;.