-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SDAG] Use shifts if ISD::MUL is illegal when lowering ISD::CTPOP #86505
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-selectiondag Author: Wang Pengcheng (wangpc-pp) Changes: We can avoid libcalls. Fixes #86205 Patch is 110.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/86505.diff 11 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8be03b66e155f6..e0662d57bb4bba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8709,11 +8709,21 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
DAG.getConstant(0xFF, dl, VT));
}
- // v = (v * 0x01010101...) >> (Len - 8)
- SDValue Mask01 =
- DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
- return DAG.getNode(ISD::SRL, dl, VT,
- DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
+ SDValue V;
+ if (isOperationLegalOrCustomOrPromote(ISD::MUL, VT)) {
+ // v = (v * 0x01010101...) >> (Len - 8)
+ SDValue Mask01 =
+ DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
+ V = DAG.getNode(ISD::MUL, dl, VT, Op, Mask01);
+ } else {
+ V = Op;
+ SDValue ShiftC = DAG.getConstant(8, dl, VT);
+ for (unsigned I = 8; I < Len; I += 8) {
+ V = DAG.getNode(ISD::ADD, dl, VT, Op,
+ DAG.getNode(ISD::SHL, dl, VT, V, ShiftC));
+ }
+ }
+ return DAG.getNode(ISD::SRL, dl, VT, V,
DAG.getConstant(Len - 8, dl, ShVT));
}
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index 9fa3f5076bb221..c3731fc4f2e19f 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -285,9 +285,12 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
; LA64-NEXT: lu12i.w $a1, 61680
; LA64-NEXT: ori $a1, $a1, 3855
; LA64-NEXT: and $a0, $a0, $a1
-; LA64-NEXT: lu12i.w $a1, 4112
-; LA64-NEXT: ori $a1, $a1, 257
-; LA64-NEXT: mul.d $a0, $a0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 8
+; LA64-NEXT: add.d $a1, $a0, $a1
+; LA64-NEXT: slli.d $a1, $a1, 8
+; LA64-NEXT: add.d $a1, $a0, $a1
+; LA64-NEXT: slli.d $a1, $a1, 8
+; LA64-NEXT: add.d $a0, $a0, $a1
; LA64-NEXT: bstrpick.d $a0, $a0, 31, 24
; LA64-NEXT: ret
%1 = call i32 @llvm.ctpop.i32(i32 %a)
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 455e6e54c9b396..8533a1d73544f3 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -1160,8 +1160,6 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: beqz a0, .LBB10_2
; RV32I-NEXT: # %bb.1: # %cond.false
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
@@ -1189,61 +1187,63 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
; RV32I-NEXT: lui a1, 61681
; RV32I-NEXT: addi a1, a1, -241
; RV32I-NEXT: and a0, a0, a1
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi a1, a1, 257
-; RV32I-NEXT: call __mulsi3
+; RV32I-NEXT: slli a1, a0, 8
+; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB10_2:
; RV32I-NEXT: li a0, 32
; RV32I-NEXT: ret
;
-; RV64I-LABEL: test_ctlz_i32:
-; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a1, a0
-; RV64I-NEXT: beqz a1, .LBB10_2
-; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: addi sp, sp, -16
-; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: srliw a1, a0, 1
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 2
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 4
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 8
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 16
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: not a0, a0
-; RV64I-NEXT: srli a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addiw a2, a2, 1365
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
-; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 16
-; RV64I-NEXT: ret
-; RV64I-NEXT: .LBB10_2:
-; RV64I-NEXT: li a0, 32
-; RV64I-NEXT: ret
+; RV64NOZBB-LABEL: test_ctlz_i32:
+; RV64NOZBB: # %bb.0:
+; RV64NOZBB-NEXT: sext.w a1, a0
+; RV64NOZBB-NEXT: beqz a1, .LBB10_2
+; RV64NOZBB-NEXT: # %bb.1: # %cond.false
+; RV64NOZBB-NEXT: srliw a1, a0, 1
+; RV64NOZBB-NEXT: or a0, a0, a1
+; RV64NOZBB-NEXT: srliw a1, a0, 2
+; RV64NOZBB-NEXT: or a0, a0, a1
+; RV64NOZBB-NEXT: srliw a1, a0, 4
+; RV64NOZBB-NEXT: or a0, a0, a1
+; RV64NOZBB-NEXT: srliw a1, a0, 8
+; RV64NOZBB-NEXT: or a0, a0, a1
+; RV64NOZBB-NEXT: srliw a1, a0, 16
+; RV64NOZBB-NEXT: or a0, a0, a1
+; RV64NOZBB-NEXT: not a0, a0
+; RV64NOZBB-NEXT: srli a1, a0, 1
+; RV64NOZBB-NEXT: lui a2, 349525
+; RV64NOZBB-NEXT: addiw a2, a2, 1365
+; RV64NOZBB-NEXT: and a1, a1, a2
+; RV64NOZBB-NEXT: sub a0, a0, a1
+; RV64NOZBB-NEXT: lui a1, 209715
+; RV64NOZBB-NEXT: addiw a1, a1, 819
+; RV64NOZBB-NEXT: and a2, a0, a1
+; RV64NOZBB-NEXT: srli a0, a0, 2
+; RV64NOZBB-NEXT: and a0, a0, a1
+; RV64NOZBB-NEXT: add a0, a2, a0
+; RV64NOZBB-NEXT: srli a1, a0, 4
+; RV64NOZBB-NEXT: add a0, a0, a1
+; RV64NOZBB-NEXT: lui a1, 61681
+; RV64NOZBB-NEXT: addi a2, a1, -241
+; RV64NOZBB-NEXT: and a2, a0, a2
+; RV64NOZBB-NEXT: slli a0, a0, 8
+; RV64NOZBB-NEXT: addi a1, a1, -256
+; RV64NOZBB-NEXT: and a0, a0, a1
+; RV64NOZBB-NEXT: add a0, a2, a0
+; RV64NOZBB-NEXT: slli a0, a0, 8
+; RV64NOZBB-NEXT: add a0, a2, a0
+; RV64NOZBB-NEXT: slli a0, a0, 8
+; RV64NOZBB-NEXT: add a0, a2, a0
+; RV64NOZBB-NEXT: srliw a0, a0, 24
+; RV64NOZBB-NEXT: ret
+; RV64NOZBB-NEXT: .LBB10_2:
+; RV64NOZBB-NEXT: li a0, 32
+; RV64NOZBB-NEXT: ret
;
; RV32M-LABEL: test_ctlz_i32:
; RV32M: # %bb.0:
@@ -1285,47 +1285,6 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
; RV32M-NEXT: li a0, 32
; RV32M-NEXT: ret
;
-; RV64M-LABEL: test_ctlz_i32:
-; RV64M: # %bb.0:
-; RV64M-NEXT: sext.w a1, a0
-; RV64M-NEXT: beqz a1, .LBB10_2
-; RV64M-NEXT: # %bb.1: # %cond.false
-; RV64M-NEXT: srliw a1, a0, 1
-; RV64M-NEXT: or a0, a0, a1
-; RV64M-NEXT: srliw a1, a0, 2
-; RV64M-NEXT: or a0, a0, a1
-; RV64M-NEXT: srliw a1, a0, 4
-; RV64M-NEXT: or a0, a0, a1
-; RV64M-NEXT: srliw a1, a0, 8
-; RV64M-NEXT: or a0, a0, a1
-; RV64M-NEXT: srliw a1, a0, 16
-; RV64M-NEXT: or a0, a0, a1
-; RV64M-NEXT: not a0, a0
-; RV64M-NEXT: srli a1, a0, 1
-; RV64M-NEXT: lui a2, 349525
-; RV64M-NEXT: addiw a2, a2, 1365
-; RV64M-NEXT: and a1, a1, a2
-; RV64M-NEXT: sub a0, a0, a1
-; RV64M-NEXT: lui a1, 209715
-; RV64M-NEXT: addiw a1, a1, 819
-; RV64M-NEXT: and a2, a0, a1
-; RV64M-NEXT: srli a0, a0, 2
-; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: add a0, a2, a0
-; RV64M-NEXT: srli a1, a0, 4
-; RV64M-NEXT: add a0, a0, a1
-; RV64M-NEXT: lui a1, 61681
-; RV64M-NEXT: addi a1, a1, -241
-; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: lui a1, 4112
-; RV64M-NEXT: addi a1, a1, 257
-; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: srliw a0, a0, 24
-; RV64M-NEXT: ret
-; RV64M-NEXT: .LBB10_2:
-; RV64M-NEXT: li a0, 32
-; RV64M-NEXT: ret
-;
; RV32ZBB-LABEL: test_ctlz_i32:
; RV32ZBB: # %bb.0:
; RV32ZBB-NEXT: clz a0, a0
@@ -1354,19 +1313,16 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32I-LABEL: test_ctlz_i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -32
-; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s0, a1
-; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: srli a0, a1, 1
-; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: lui a2, 349525
+; RV32I-NEXT: addi a4, a2, 1365
+; RV32I-NEXT: lui a2, 209715
+; RV32I-NEXT: addi a3, a2, 819
+; RV32I-NEXT: lui a2, 61681
+; RV32I-NEXT: addi a2, a2, -241
+; RV32I-NEXT: bnez a1, .LBB11_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 4
@@ -1377,28 +1333,28 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: not a0, a0
; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s4, a2, 1365
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, a4
; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: lui a1, 209715
-; RV32I-NEXT: addi s5, a1, 819
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: and a1, a0, a3
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, a3
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
-; RV32I-NEXT: lui a1, 61681
-; RV32I-NEXT: addi s6, a1, -241
-; RV32I-NEXT: and a0, a0, s6
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi s3, a1, 257
-; RV32I-NEXT: mv a1, s3
-; RV32I-NEXT: call __mulsi3
-; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: srli a0, s2, 1
-; RV32I-NEXT: or a0, s2, a0
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: slli a1, a0, 8
+; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: srli a0, a0, 24
+; RV32I-NEXT: addi a0, a0, 32
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB11_2:
+; RV32I-NEXT: srli a0, a1, 1
+; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 4
@@ -1409,43 +1365,29 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: not a0, a0
; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, a4
; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: and a1, a0, a3
; RV32I-NEXT: srli a0, a0, 2
-; RV32I-NEXT: and a0, a0, s5
+; RV32I-NEXT: and a0, a0, a3
; RV32I-NEXT: add a0, a1, a0
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: add a0, a0, a1
-; RV32I-NEXT: and a0, a0, s6
-; RV32I-NEXT: mv a1, s3
-; RV32I-NEXT: call __mulsi3
-; RV32I-NEXT: bnez s0, .LBB11_2
-; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: slli a1, a0, 8
+; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: addi a0, a0, 32
-; RV32I-NEXT: j .LBB11_3
-; RV32I-NEXT: .LBB11_2:
-; RV32I-NEXT: srli a0, s1, 24
-; RV32I-NEXT: .LBB11_3:
; RV32I-NEXT: li a1, 0
-; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: test_ctlz_i64:
; RV64I: # %bb.0:
; RV64I-NEXT: beqz a0, .LBB11_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: addi sp, sp, -16
-; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: srli a1, a0, 2
@@ -1481,14 +1423,21 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV64I-NEXT: slli a2, a1, 32
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: slli a1, a0, 8
+; RV64I-NEXT: add a1, a0, a1
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: add a1, a0, a1
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: add a1, a0, a1
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: add a1, a0, a1
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: add a1, a0, a1
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: add a1, a0, a1
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: srli a0, a0, 56
-; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB11_2:
; RV64I-NEXT: li a0, 64
@@ -1831,8 +1780,6 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
; RV32I-LABEL: test_ctlz_i32_zero_undef:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
@@ -1860,52 +1807,54 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
; RV32I-NEXT: lui a1, 61681
; RV32I-NEXT: addi a1, a1, -241
; RV32I-NEXT: and a0, a0, a1
-; RV32I-NEXT: lui a1, 4112
-; RV32I-NEXT: addi a1, a1, 257
-; RV32I-NEXT: call __mulsi3
+; RV32I-NEXT: slli a1, a0, 8
+; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: add a1, a0, a1
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
-; RV64I-LABEL: test_ctlz_i32_zero_undef:
-; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -16
-; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: srliw a1, a0, 1
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 2
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 4
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 8
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 16
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: not a0, a0
-; RV64I-NEXT: srli a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addiw a2, a2, 1365
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw a1, a1, 819
-; RV64I-NEXT: and a2, a0, a1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: add a0, a2, a0
-; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw a1, a1, -241
-; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
-; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 16
-; RV64I-NEXT: ret
+; RV64NOZBB-LABEL: test_ctlz_i32_zero_undef:
+; RV64NOZBB: # %bb.0:
+; RV64NOZBB-NEXT: srliw a1, a0, 1
+; RV64NOZBB-NEXT: or a0, a0, a1
+; RV64NOZBB-NEXT: srliw a1, a0, 2
+; RV64NOZBB-NEXT: or a0, a0, a1
+; RV64NOZBB-NEXT: srliw a1, a0, 4
+; RV64NOZBB-NEXT: or a0, a0, a1
+; RV64NOZBB-NEXT: srliw a1, a0, 8
+; RV64NOZBB-NEXT: or a0, a0, a1
+; RV64NOZBB-NEXT: srliw a1, a0, 16
+; RV64NOZBB-NEXT: or a0, a0, a1
+; RV64NOZBB-NEXT: not a0, a0
+; RV64NOZBB-NEXT: srli a1, a0, 1
+; RV64NOZBB-NEXT: lui a2, 349525
+; RV64NOZBB-NEXT: addiw a2, a2, 1365
+; RV64NOZBB-NEXT: and a1, a1, a2
+; RV64NOZBB-NEXT: sub a0, a0, a1
+; RV64NOZBB-NEXT: lui a1, 209715
+; RV64NOZBB-NEXT: addiw a1, a1, 819
+; RV64NOZBB-NEXT: and a2, a0, a1
+; RV64NOZBB-NEXT: srli a0, a0, 2
+; RV64NOZBB-NEXT: and a0, a0, a1
+; RV64NOZBB-NEXT: add a0, a2, a0
+; RV64NOZBB-NEXT: srli a1, a0, 4
+; RV64NOZBB-NEXT: add a0, a0, a1
+; RV64NOZBB-NEXT: lui a1, 61681
+; RV64NOZBB-NEXT: addi a2, a1, -241
+; RV64NOZBB-NEXT: and a2, a0, a2
+; RV64NOZBB-NEXT: slli a0, a0, 8
+; RV64NOZBB-NEXT: addi a1, a1, -256
+; RV64NOZBB-NEXT: and a0, a0, a1
+; RV64NOZBB-NEXT: add a0, a2, a0
+; RV64NOZBB-NEXT: slli a0, a0, 8
+; RV64NOZBB-NEXT: add a0, a2, a0
+; RV64NOZBB-NEXT: slli a0, a0, 8
+; RV64NOZBB-NEXT: add a0, a2, a0
+; RV64NOZBB-NEXT: srliw a0, a0, 24
+; RV64NOZBB-NEXT: ret
;
; RV32M-LABEL: test_ctlz_i32_zero_undef:
; RV32M: # %bb.0:
@@ -1942,41 +1891,6 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
; RV32M-NEXT: srli a0, a0, 24
; RV32M-NEXT: ret
;
-; RV64M-LABEL: test_ctlz_i32_zero_undef:
-; RV64M: # %bb.0:
-; RV64M-NEXT: srliw a1, a0, 1
-; RV64M-NEXT: or a0, a0, a1
-; RV64M-NEXT: srliw a1, a0, 2
-; RV64M-NEXT: or a0, a0, a1
-; RV64M-NEXT: srliw a1, a0, 4
-; RV64M-NEXT: or a0, a0, a1
-; RV64M-NEXT: srliw a1, a0, 8
-; RV64M-NEXT: or a0, a0, a1
-; RV64M-NEXT: srliw a1, a0, 16
-; RV64M-NEXT: or a0, a0, a1
-; RV64M-NEXT: not a0, a0
-; RV64M-NEXT: srli a1, a0, 1
-; RV64M-NEXT: lui a2, 349525
-; RV64M-NEXT: addiw a2, a2, 1365
-; RV64M-NEXT: and a1, a1, a2
-; RV64M-NEXT: sub a0, a0, a1
-; RV64M-NEXT: lui a1, 209715
-; RV64M-NEXT: addiw a1, a1, 819
-; RV64M-NEXT: and a2, a0, a1
-; RV64M-NEXT: srli a0, a0, 2
-; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: add a0, a2, a0
-; RV64M-NEXT: srli a1, a0, 4
-; RV64M-NEXT: add a0, a0, a1
-; RV64M-NEXT: lui a1, 61681
-; RV64M-NEXT: addi a1, a1, -241
-; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: lui a1, 4112
-; RV64M-NEXT: addi a1, a1, 257
-; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: srliw a0, a0, 24
-; RV64M-NEXT: ret
-;
; RV32ZBB-LABEL: test_ctlz_i32_zero_undef:
; RV32ZBB: # %bb.0:
; RV32ZBB-NEXT: clz a0, a0
@@ -2005,19 +1919,16 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
; RV32I-LABEL: test_ctlz_i64_zero_undef:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -32
-; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s0, a1
-; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: srli a0, a1, 1
-; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: lui a2, 349525
+; RV32I-NEXT: addi a4, a2, 1365
+; RV32I-NEXT: lui a2, 209715
+; RV32I-NEXT: addi a3, a2, 819
+; RV32I-NEXT: lui a2, 61681
+; RV32I-NEXT: addi a2, a2, -241
+; RV32I-NEXT: bnez a1, .LBB15_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 4
@@ -2028,28 +1939,28 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: not a0, a0
; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: lui a2, 349525
-; RV32I-NEXT: addi s4, a2, 1365
-; RV32I-NEXT: and a1, a1, s4
+; RV32I-NEXT: and a1, a1, a4
; RV32I-NEXT: sub a...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
✅ With the latest revision this PR passed the Python code formatter. |
You can do this with fewer iterations by doubling the shift amount each time, e.g. for 64 bits:
|
Or
But I think either would work. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you also make the mirror GlobalISel change?
case TargetOpcode::G_CTPOP: { |
2e2cbc8
to
aabc088
Compare
Sure! I have added it! But I'm not familiar with GISel, please double check it! |
[SDAG] Use shifts if ISD::MUL is illegal when lowering ISD::CTPOP/ISD::VP_CTPOP — We can avoid libcalls. Fixes llvm#86205
9400323
to
02baa6c
Compare
auto Shl = B.buildShl(Ty, ResTmp, ShiftC); | ||
ResTmp = B.buildAdd(Ty, ResTmp, Shl); | ||
} | ||
B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is assuming the result type is the same as the source type, which is not the case. CTPOP has 2 type indices, the result type may differ from the source
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see this is an existing bug, so I suppose it's best to leave as-is and fix separately
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not familiar with GlobalISel, so I will leave it to you guys to fix it. :-)
}; | ||
if (IsMulSupported(Ty)) { | ||
auto ResTmp = B.buildMul(Ty, B8Count, MulMask); | ||
B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same here
We can avoid libcalls.
Fixes #86205