
Conversation

@linuxrocks123
Contributor

@linuxrocks123 linuxrocks123 commented Oct 23, 2025

This PR optimizes the pattern bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU. It also creates a Clang builtin for s_bcnt0_i32 so that users can call this instruction directly instead of relying on the compiler to match this pattern.
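For illustration, this is the kind of source the pattern covers (a minimal sketch; the function name is hypothetical and the popcount call is the generic Clang/GCC builtin, not the new AMDGPU one):

// Count the zero bits of a 32-bit value as 32 - popcount(x).
// With this PR, the uniform form of this expression is intended
// to lower to a single s_bcnt0_i32_b32.
unsigned count_zero_bits(unsigned x) {
  return 32u - __builtin_popcount(x);
}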

@linuxrocks123 linuxrocks123 marked this pull request as ready for review October 23, 2025 19:20
@llvmbot llvmbot added clang Clang issues not falling into any other category backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" llvm:ir labels Oct 23, 2025
@llvmbot
Member

llvmbot commented Oct 23, 2025

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-clang

Author: Patrick Simmons (linuxrocks123)

Changes

This PR optimizes the pattern bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU. It also creates a Clang builtin for s_bcnt0_i32 so that users can call this instruction directly instead of relying on the compiler to match this pattern.


Full diff: https://github.com/llvm/llvm-project/pull/164847.diff

5 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+3)
  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+8)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+43)
  • (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+6-2)
  • (modified) llvm/test/CodeGen/AMDGPU/s_cmp_0.ll (+17-21)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8428fa97fe445..f17156f8a24ab 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,6 +63,9 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
 
+BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc")
+
 TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 9e334d4316336..50b43a1c927ce 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2359,6 +2359,14 @@ def int_amdgcn_mbcnt_hi :
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
             [IntrNoMem]>;
 
+def int_amdgcn_bcnt032_lo :
+  ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
+  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+def int_amdgcn_bcnt064_lo :
+  ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">,
+  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+
 // llvm.amdgcn.ds.swizzle src offset
 def int_amdgcn_ds_swizzle :
   ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8e35ba77d69aa..39b558694edf8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
@@ -35,6 +36,7 @@
 #include "llvm/Support/KnownFPClass.h"
 #include "llvm/Transforms/Utils/IntegerDivision.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include <cstdint>
 
 #define DEBUG_TYPE "amdgpu-codegenprepare"
 
@@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand(
   cl::ReallyHidden,
   cl::init(false));
 
+// Disable processing of fdiv so we can better test the backend implementations.
+static cl::opt<bool>
+    DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0",
+                 cl::desc("Prevent transforming bitsin(typeof(x)) - "
+                          "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"),
+                 cl::ReallyHidden, cl::init(false));
+
 class AMDGPUCodeGenPrepareImpl
     : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
 public:
@@ -258,6 +267,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
+  bool visitCtpop(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
   bool visitSqrt(IntrinsicInst &I);
   bool run();
@@ -1910,6 +1920,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
     return visitFMinLike(I);
   case Intrinsic::sqrt:
     return visitSqrt(I);
+  case Intrinsic::ctpop:
+    return visitCtpop(I);
   default:
     return false;
   }
@@ -1977,6 +1989,37 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
   return insertValues(Builder, FractArg->getType(), ResultVals);
 }
 
+bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
+  uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
+  if (!I.hasOneUse() ||
+      !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
+    return false;
+
+  BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back());
+  if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub)
+    return false;
+
+  ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0));
+  if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth)
+    return false;
+
+  IRBuilder<> Builder(MustBeSub);
+  Instruction *TransformedIns =
+      Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo
+                                            : Intrinsic::amdgcn_bcnt032_lo,
+                              {}, {I.getArgOperand(0)});
+
+  if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
+      (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth()))
+    TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
+        TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
+
+  MustBeSub->replaceAllUsesWith(TransformedIns);
+  TransformedIns->takeName(MustBeSub);
+  MustBeSub->eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
   Value *FractArg = matchFractPat(I);
   if (!FractArg)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b621fe78..29104d33a8aa8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -264,8 +264,12 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
 } // End isReMaterializable = 1, isAsCheapAsAMove = 1
 
 let Defs = [SCC] in {
-def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
-def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
+def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
+  [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))]
+>;
+def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64",
+  [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))]
+>;
 def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
   [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
 >;
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index dd5f838b4a206..db030d2b19d90 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -444,16 +444,14 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) {
 define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
 ; CHECK-LABEL: bcnt032:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT:    s_sub_i32 s0, 32, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; CHECK-NEXT:    s_bcnt0_i32_b32 s0, s0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ; return to shader part epilog
   %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
   %result2 = sub i32 32, %result
   call void asm "; use $0", "s"(i32 %result2)
@@ -465,17 +463,15 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
 define amdgpu_ps i32 @bcnt064(i64 inreg %val0) {
 ; CHECK-LABEL: bcnt064:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
-; CHECK-NEXT:    s_sub_u32 s0, 64, s0
-; CHECK-NEXT:    s_subb_u32 s1, 0, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:1]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; CHECK-NEXT:    s_bcnt0_i32_b64 s0, s[0:1]
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[0:1]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
   %result2 = sub i64 64, %result
   call void asm "; use $0", "s"(i64 %result2)

@@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));

// Disable processing of fdiv so we can better test the backend implementations.
Contributor

Comment needs to be updated.

Contributor Author

Done

Comment on lines 2362 to 2363
def int_amdgcn_bcnt032_lo :
ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
Contributor

Is "bcnt032_lo" the name we want to use? For comparison:

  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
  ClangBuiltin<"__builtin_amdgcn_msad_u8">,

follows the mnemonic without an initial "v_".

Contributor Author

@LU-JOHN I removed the 0, which I feel makes them closer to the existing __builtin_amdgcn_mbcnt_lo. What do you think?

Re the option, I think most of the other transformations have options, so I feel this one should as well.

@github-actions

github-actions bot commented Oct 23, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Contributor

@arsenm arsenm left a comment

This should not introduce a builtin or intrinsic. This can be done purely in tablegen patterns, without the intermediate step.

; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: ; return to shader part epilog
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
Contributor

Seems like missing tests? What happens in VALU cases? Negative tests for multiple uses of the popcnt?

Contributor Author

Done

 - Add tests
 - Remove builtin (users will need inline assembly if pattern match fails)
Contributor

@arsenm arsenm left a comment

This should not introduce a new intrinsic and only needs a tablegen pattern. The one benefit you get out of doing this fold in the IR would be sinking a popcnt out of a block, but you can do that just by handling this case in isProfitableToSinkOperands, and that avoids teaching all of the known bits / sign bits / simplify demanded bits code about this
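For context, a rough sketch of that alternative (assuming the current GCNTTIImpl::isProfitableToSinkOperands hook; this is an illustration, not code from the PR):

// Hypothetical sketch: sink a single-use ctpop into the block of its
// "bitwidth - ctpop(x)" user so both instructions reach ISel together
// and the s_bcnt0 tablegen pattern can match.
bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;
  if (!I->getType()->isIntegerTy())
    return false;
  Value *X;
  if (match(I, m_Sub(m_SpecificInt(I->getType()->getIntegerBitWidth()),
                     m_OneUse(m_Intrinsic<Intrinsic::ctpop>(m_Value(X)))))) {
    Ops.push_back(&I->getOperandUse(1)); // sink the ctpop call
    return true;
  }
  return false;
}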

DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;

def int_amdgcn_bcnt32_lo :
Contributor

This does not need an intrinsic. It doesn't help match the pattern in any additional cases, and it introduces new support burdens on every optimization.

Contributor Author

Removed.

Contributor

Doesn't seem to be removed?

Contributor Author

@jayfoad there is no change to Clang introducing an intrinsic now. What are you referring to?

Contributor Author

@linuxrocks123 linuxrocks123 Oct 28, 2025

@arsenm were you referring to the Clang intrinsic or all code in IntrinsicsAMDGPU.td? If you do not want this file changed, how can I perform the optimization in AMDGPUCodeGenPrepare.cpp without referring to Intrinsic::amdgcn_bcnt32_lo and Intrinsic::amdgcn_bcnt64_lo?

Contributor

"Intrinsic" means the LLVM IR special function llvm.amdgcn.bcnt32.lo(). "Builtin" means the Clang/C++ special function __builtin_amdgcn_bcnt32_lo().

Contributor

@jayfoad jayfoad left a comment

Looks good overall.

@@ -1884,6 +1886,13 @@ def : GCNPat <
(S_MOV_B32 (i32 0)), sub1))
>;

def : GCNPat <
(i64 (UniformBinFrag<sub> 64, (UniformUnaryFrag<ctpop> i64:$src))),
Contributor

Same here.

Contributor Author

Done

@@ -264,7 +264,9 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
} // End isReMaterializable = 1, isAsCheapAsAMove = 1

let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
[(set i32:$sdst, (UniformBinFrag<sub> 32, (UniformUnaryFrag<ctpop> i32:$src0)))]
Contributor

You don't need both UniformFrags. Standard practice is to put it only on the outermost expression:

Suggested change
[(set i32:$sdst, (UniformBinFrag<sub> 32, (UniformUnaryFrag<ctpop> i32:$src0)))]
[(set i32:$sdst, (UniformBinFrag<sub> 32, (ctpop i32:$src0)))]

(In this particular case I think it would also work to put it only on the inner expression, but let's not do that.)

Contributor Author

Done

Comment on lines 467 to 468
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
Contributor

As a follow-up, it would be good if we could somehow generate a 32-bit compare instead here:

Suggested change
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cmp_lg_u32 s0, 0

Contributor Author

@linuxrocks123 linuxrocks123 Oct 29, 2025

@jayfoad yes, I noticed this, too. I think what we really should do here is reverse the order of the MOV and the BCNT instructions. If we did that, we could eliminate the comparison instruction entirely since BCNT already updates SCC.

I think that work belongs in a separate PR adding a pass that runs on the Machine IR shortly after ISel. Such a pass would help here, but it may also catch other opportunities unrelated to this one.

@@ -625,3 +622,111 @@ if:
endif:
ret i32 1
}

define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
Contributor

s_cmp_0.ll was intended to test deleting redundant s_cmp* sX, 0 instructions. These new bcnt0* tests should be in a different file.

Contributor Author

Done

@LU-JOHN LU-JOHN changed the title Match bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU [AMDGPU] Match bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 Oct 30, 2025
Contributor

@LU-JOHN LU-JOHN left a comment

LGTM

@LU-JOHN LU-JOHN requested a review from arsenm October 30, 2025 14:29
%cmp = icmp ne i64 %result2, 0
%zext = zext i1 %cmp to i32
ret i32 %zext
}
Contributor

Missing newline at end of file

Contributor Author

Fixed

@@ -0,0 +1,110 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
Contributor

Add a GlobalISel run line to check that the patterns work there too?
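For reference, such a line might look like the following, mirroring the existing SDAG run line (the exact flags are a suggestion):

; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s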

Contributor Author

@linuxrocks123 linuxrocks123 Oct 31, 2025

@jayfoad, the negative tests crash GlobalISel, so I can't add a check unless I break the positive tests out into a separate file. I'll do that if you like, but I think a better approach would be to file a JIRA issue to look into that.

Contributor

@arsenm arsenm left a comment

Description is out of date

def : GCNPat <
(i64 (UniformBinFrag<sub> 64, (ctpop i64:$src))),
(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT0_I32_B64 $src), SReg_32)), sub0,
Contributor

Is the COPY_TO_REGCLASS really necessary? I know we had a tablegen workaround of the same shape around, but I'm not sure it's still necessary.

Contributor Author

@arsenm I have no clue. I put it there because I was aping similar code in that file that seemed to use it when copying from 32-bit to 64-bit. I can remove it and see if it still works?

Comment on lines +44 to +46
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use v[5:6]
; CHECK-NEXT: ;;#ASMEND
Contributor

Oh, this is a bad bug. Your SGPR constraint was lost and silently transmuted into a VGPR. Not related to this PR, though; for your purposes you're just using an overly complicated test.

Contributor Author

Yikes! Do you want to file the issue or should I?

; CHECK-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
%val0 = load volatile i64, ptr addrspace(1) %gep
Contributor

A volatile load is ineligible for the VALU-load-to-scalar-load optimization. For your purposes, it is simpler to use an inreg argument with the shader calling convention rather than all of this boilerplate to load the value from memory.

; CHECK-NEXT: ; use s[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; return to shader part epilog
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
Contributor

Suggested change
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
%result = call i64 @llvm.ctpop.i64(i64 %val0)

Don't need the callsite attributes
