Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,9 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
} // End isReMaterializable = 1, isAsCheapAsAMove = 1

let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
[(set i32:$sdst, (UniformBinFrag<sub> 32, (ctpop i32:$src0)))]
>;
def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
Expand Down Expand Up @@ -1884,6 +1886,13 @@ def : GCNPat <
(S_MOV_B32 (i32 0)), sub1))
>;

// Select a uniform "64 - ctpop(x)" on an i64 source to S_BCNT0_I32_B64,
// which counts the zero bits directly. The scalar instruction yields a
// 32-bit result (<= 64), so rebuild the i64 by pairing the count with a
// zero high half via REG_SEQUENCE.
// NOTE(review): the COPY_TO_REGCLASS to SReg_32 mirrors nearby patterns and
// may no longer be necessary — TODO confirm whether it can be dropped.
def : GCNPat <
  (i64 (UniformBinFrag<sub> 64, (ctpop i64:$src))),
  (i64 (REG_SEQUENCE SReg_64,
    (i32 (COPY_TO_REGCLASS (S_BCNT0_I32_B64 $src), SReg_32)), sub0,
    (S_MOV_B32 (i32 0)), sub1))
>;

def : GCNPat <
(i32 (UniformBinFrag<smax> i32:$x, (i32 (ineg i32:$x)))),
(S_ABS_I32 SReg_32:$x)
Expand Down
110 changes: 110 additions & 0 deletions llvm/test/CodeGen/AMDGPU/s_bcnt0.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a GlobalISel run line to check that the patterns work there too?

Copy link
Contributor Author

@linuxrocks123 linuxrocks123 Oct 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jayfoad, the negative tests crash Global ISel, so I can't add a check unless I break out the positive tests to a separate file. I'll do that if you like, but I think a better approach would be to file a JIRA issue to look into that.


; Negative test: the ctpop source is loaded through a per-lane address, so it
; is divergent (lives in VGPRs). The scalar s_bcnt0_i32_b32 pattern must NOT
; fire; codegen is expected to use VALU v_bcnt_u32_b32 + v_sub_u32 instead.
define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; CHECK-LABEL: bcnt032_not_for_vregs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; CHECK-NEXT: global_load_dword v2, v[2:3], off glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use v3
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
  %val0 = load volatile i32, ptr addrspace(1) %gep
  ; Callsite attributes dropped: the intrinsic already carries them.
  %result = call i32 @llvm.ctpop.i32(i32 %val0)
  %result2 = sub i32 32, %result
  call void asm "; use $0", "s"(i32 %result2)
  ; %cmp/%zext are dead here; kept so this IR stays parallel to the
  ; positive tests below, which do consume them.
  %cmp = icmp ne i32 %result2, 0
  %zext = zext i1 %cmp to i32
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; Negative test (i64 variant): divergent input must not select
; s_bcnt0_i32_b64; expect the two-step VALU v_bcnt_u32_b32 chain plus a
; 64-bit VALU subtract.
; NOTE(review): the "s" asm constraint below appears to be lost — the value
; ends up in v[5:6] (VGPRs) rather than SGPRs. Pre-existing bug, not caused
; by this change; worth filing separately — TODO confirm.
; NOTE(review): the volatile load is only boilerplate to keep the value in
; VGPRs; an inreg-free plain argument would be simpler — TODO consider.
define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; CHECK-LABEL: bcnt064_not_for_vregs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
; CHECK-NEXT: v_bcnt_u32_b32 v3, v3, v2
; CHECK-NEXT: v_sub_co_u32_e32 v5, vcc, 64, v3
; CHECK-NEXT: v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use v[5:6]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off
; CHECK-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
  %val0 = load volatile i64, ptr addrspace(1) %gep
  %result = call i64 @llvm.ctpop.i64(i64 %val0)
  %result2 = sub i64 64, %result
  call void asm "; use $0", "s"(i64 %result2)
  ; Dead %cmp/%zext mirror the positive tests' structure.
  %cmp = icmp ne i64 %result2, 0
  %zext = zext i1 %cmp to i32
  store i64 %result, ptr addrspace(1) %out
  ret void
}

; Positive test: a uniform (inreg) source where BOTH ctpop and 32-ctpop are
; live. Expect both scalar forms to be selected: s_bcnt1_i32_b32 for the raw
; population count and s_bcnt0_i32_b32 for the zero-bit count.
define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) {
; CHECK-LABEL: bcnt032_ctpop_multiple_uses:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0
; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: ; return to shader part epilog
  ; Callsite attributes dropped: the intrinsic already carries them.
  %result = call i32 @llvm.ctpop.i32(i32 %val0)
  %result2 = sub i32 32, %result
  call void asm "; use $0", "s"(i32 %result)
  call void asm "; use $0", "s"(i32 %result2)
  %cmp = icmp ne i32 %result2, 0
  %zext = zext i1 %cmp to i32
  ret i32 %zext
}

; Positive test (i64 variant): both ctpop and 64-ctpop are live on a uniform
; source; expect s_bcnt1_i32_b64 and s_bcnt0_i32_b64, with the 32-bit count
; widened back to 64 bits via a zero high half (s_mov_b32 ... 0).
define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) {
; CHECK-LABEL: bcnt064_ctpop_multiple_uses:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s3, 0
; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1]
; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1]
; CHECK-NEXT: s_mov_b32 s1, s3
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; return to shader part epilog
  ; Callsite attributes dropped (reviewer suggestion): the intrinsic
  ; declaration already carries them.
  %result = call i64 @llvm.ctpop.i64(i64 %val0)
  %result2 = sub i64 64, %result
  call void asm "; use $0", "s"(i64 %result)
  call void asm "; use $0", "s"(i64 %result2)
  %cmp = icmp ne i64 %result2, 0
  %zext = zext i1 %cmp to i32
  ret i32 %zext
}
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
Original file line number Diff line number Diff line change
Expand Up @@ -444,9 +444,7 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) {
define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
; CHECK-LABEL: bcnt032:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
; CHECK-NEXT: s_sub_i32 s0, 32, s0
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
Expand All @@ -465,9 +463,8 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
define amdgpu_ps i32 @bcnt064(i64 inreg %val0) {
; CHECK-LABEL: bcnt064:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT: s_sub_u32 s0, 64, s0
; CHECK-NEXT: s_subb_u32 s1, 0, 0
; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1]
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
Expand Down