
[SDAG] Use shifts if ISD::MUL is illegal when lowering ISD::CTPOP #86505

Merged · wangpc-pp merged 7 commits into llvm:main on Mar 29, 2024

Conversation

wangpc-pp
Contributor

This avoids libcalls (e.g. __mulsi3/__muldi3) when expanding ISD::CTPOP on targets where ISD::MUL is illegal.

Fixes #86205
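
Concretely, the final step of the expansion sums the per-byte partial counts: with a legal multiply this is (v * 0x01010101...) >> (Len - 8); without one, the same sum can be built from shift-adds, as the TargetLowering.cpp hunk below shows. A minimal standalone sketch of the equivalence (plain C++ for illustration, not the LLVM API; the function names are invented):

#include <cassert>
#include <cstdint>

// After the usual SWAR popcount steps, each byte of `v` holds a partial
// count of at most 8. The classic finish multiplies by 0x01010101 so the
// top byte accumulates the sum of all four bytes, then shifts it down.
uint32_t finish_mul(uint32_t v) { return (v * 0x01010101u) >> 24; }

// Shift-add replacement: acc = v + (acc << 8), once per remaining byte,
// expands to v + (v << 8) + (v << 16) + (v << 24), which is exactly
// v * 0x01010101 modulo 2^32.
uint32_t finish_shift_add(uint32_t v) {
  uint32_t acc = v;
  for (unsigned i = 8; i < 32; i += 8)
    acc = v + (acc << 8);
  return acc >> 24;
}

int main() {
  for (uint32_t x : {0u, 0x01020304u, 0x08080808u, 0xdeadbeefu})
    assert(finish_mul(x) == finish_shift_add(x));
}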

@llvmbot added the backend:loongarch and llvm:SelectionDAG (SelectionDAGISel as well) labels on Mar 25, 2024
@llvmbot
Collaborator

llvmbot commented Mar 25, 2024

@llvm/pr-subscribers-backend-aarch64
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-backend-loongarch

@llvm/pr-subscribers-llvm-selectiondag

Author: Wang Pengcheng (wangpc-pp)

Changes

This avoids libcalls (e.g. __mulsi3/__muldi3) when expanding ISD::CTPOP on targets where ISD::MUL is illegal.

Fixes #86205


Patch is 110.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/86505.diff

11 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+15-5)
  • (modified) llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll (+6-3)
  • (modified) llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll (+353-485)
  • (modified) llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll (+52-62)
  • (modified) llvm/test/CodeGen/RISCV/rv32xtheadbb.ll (+43-59)
  • (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+211-260)
  • (modified) llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll (+14-9)
  • (modified) llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll (+28-18)
  • (modified) llvm/test/CodeGen/RISCV/rv64xtheadbb.ll (+120-117)
  • (modified) llvm/test/CodeGen/RISCV/rv64zbb.ll (+255-252)
  • (modified) llvm/test/CodeGen/RISCV/sextw-removal.ll (+9-4)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8be03b66e155f6..e0662d57bb4bba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8709,11 +8709,21 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
                      DAG.getConstant(0xFF, dl, VT));
   }
 
-  // v = (v * 0x01010101...) >> (Len - 8)
-  SDValue Mask01 =
-      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
-  return DAG.getNode(ISD::SRL, dl, VT,
-                     DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
+  SDValue V;
+  if (isOperationLegalOrCustomOrPromote(ISD::MUL, VT)) {
+    // v = (v * 0x01010101...) >> (Len - 8)
+    SDValue Mask01 =
+        DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
+    V = DAG.getNode(ISD::MUL, dl, VT, Op, Mask01);
+  } else {
+    V = Op;
+    SDValue ShiftC = DAG.getConstant(8, dl, VT);
+    for (unsigned I = 8; I < Len; I += 8) {
+      V = DAG.getNode(ISD::ADD, dl, VT, Op,
+                      DAG.getNode(ISD::SHL, dl, VT, V, ShiftC));
+    }
+  }
+  return DAG.getNode(ISD::SRL, dl, VT, V,
                      DAG.getConstant(Len - 8, dl, ShVT));
 }
 
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index 9fa3f5076bb221..c3731fc4f2e19f 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -285,9 +285,12 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; LA64-NEXT:    lu12i.w $a1, 61680
 ; LA64-NEXT:    ori $a1, $a1, 3855
 ; LA64-NEXT:    and $a0, $a0, $a1
-; LA64-NEXT:    lu12i.w $a1, 4112
-; LA64-NEXT:    ori $a1, $a1, 257
-; LA64-NEXT:    mul.d $a0, $a0, $a1
+; LA64-NEXT:    slli.d $a1, $a0, 8
+; LA64-NEXT:    add.d $a1, $a0, $a1
+; LA64-NEXT:    slli.d $a1, $a1, 8
+; LA64-NEXT:    add.d $a1, $a0, $a1
+; LA64-NEXT:    slli.d $a1, $a1, 8
+; LA64-NEXT:    add.d $a0, $a0, $a1
 ; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 24
 ; LA64-NEXT:    ret
   %1 = call i32 @llvm.ctpop.i32(i32 %a)
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 455e6e54c9b396..8533a1d73544f3 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -1160,8 +1160,6 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    beqz a0, .LBB10_2
 ; RV32I-NEXT:  # %bb.1: # %cond.false
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 2
@@ -1189,61 +1187,63 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
 ; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi a1, a1, 257
-; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB10_2:
 ; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    ret
 ;
-; RV64I-LABEL: test_ctlz_i32:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    sext.w a1, a0
-; RV64I-NEXT:    beqz a1, .LBB10_2
-; RV64I-NEXT:  # %bb.1: # %cond.false
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
-; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
-; RV64I-NEXT:    ret
-; RV64I-NEXT:  .LBB10_2:
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    ret
+; RV64NOZBB-LABEL: test_ctlz_i32:
+; RV64NOZBB:       # %bb.0:
+; RV64NOZBB-NEXT:    sext.w a1, a0
+; RV64NOZBB-NEXT:    beqz a1, .LBB10_2
+; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
+; RV64NOZBB-NEXT:    srliw a1, a0, 1
+; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    srliw a1, a0, 2
+; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    srliw a1, a0, 4
+; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    srliw a1, a0, 8
+; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    srliw a1, a0, 16
+; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    not a0, a0
+; RV64NOZBB-NEXT:    srli a1, a0, 1
+; RV64NOZBB-NEXT:    lui a2, 349525
+; RV64NOZBB-NEXT:    addiw a2, a2, 1365
+; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    sub a0, a0, a1
+; RV64NOZBB-NEXT:    lui a1, 209715
+; RV64NOZBB-NEXT:    addiw a1, a1, 819
+; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    srli a0, a0, 2
+; RV64NOZBB-NEXT:    and a0, a0, a1
+; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    srli a1, a0, 4
+; RV64NOZBB-NEXT:    add a0, a0, a1
+; RV64NOZBB-NEXT:    lui a1, 61681
+; RV64NOZBB-NEXT:    addi a2, a1, -241
+; RV64NOZBB-NEXT:    and a2, a0, a2
+; RV64NOZBB-NEXT:    slli a0, a0, 8
+; RV64NOZBB-NEXT:    addi a1, a1, -256
+; RV64NOZBB-NEXT:    and a0, a0, a1
+; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    slli a0, a0, 8
+; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    slli a0, a0, 8
+; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    srliw a0, a0, 24
+; RV64NOZBB-NEXT:    ret
+; RV64NOZBB-NEXT:  .LBB10_2:
+; RV64NOZBB-NEXT:    li a0, 32
+; RV64NOZBB-NEXT:    ret
 ;
 ; RV32M-LABEL: test_ctlz_i32:
 ; RV32M:       # %bb.0:
@@ -1285,47 +1285,6 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV32M-NEXT:    li a0, 32
 ; RV32M-NEXT:    ret
 ;
-; RV64M-LABEL: test_ctlz_i32:
-; RV64M:       # %bb.0:
-; RV64M-NEXT:    sext.w a1, a0
-; RV64M-NEXT:    beqz a1, .LBB10_2
-; RV64M-NEXT:  # %bb.1: # %cond.false
-; RV64M-NEXT:    srliw a1, a0, 1
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 2
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 4
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 8
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 16
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    not a0, a0
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    and a1, a1, a2
-; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
-; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addi a1, a1, 257
-; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    srliw a0, a0, 24
-; RV64M-NEXT:    ret
-; RV64M-NEXT:  .LBB10_2:
-; RV64M-NEXT:    li a0, 32
-; RV64M-NEXT:    ret
-;
 ; RV32ZBB-LABEL: test_ctlz_i32:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    clz a0, a0
@@ -1354,19 +1313,16 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: test_ctlz_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a1
-; RV32I-NEXT:    mv s2, a0
-; RV32I-NEXT:    srli a0, a1, 1
-; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    addi a4, a2, 1365
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a3, a2, 819
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    bnez a1, .LBB11_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 4
@@ -1377,28 +1333,28 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    not a0, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi s4, a2, 1365
-; RV32I-NEXT:    and a1, a1, s4
+; RV32I-NEXT:    and a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi s5, a1, 819
-; RV32I-NEXT:    and a1, a0, s5
+; RV32I-NEXT:    and a1, a0, a3
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, s5
+; RV32I-NEXT:    and a0, a0, a3
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi s6, a1, -241
-; RV32I-NEXT:    and a0, a0, s6
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi s3, a1, 257
-; RV32I-NEXT:    mv a1, s3
-; RV32I-NEXT:    call __mulsi3
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    srli a0, s2, 1
-; RV32I-NEXT:    or a0, s2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB11_2:
+; RV32I-NEXT:    srli a0, a1, 1
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 4
@@ -1409,43 +1365,29 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    not a0, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    and a1, a1, s4
+; RV32I-NEXT:    and a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    and a1, a0, s5
+; RV32I-NEXT:    and a1, a0, a3
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, s5
+; RV32I-NEXT:    and a0, a0, a3
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    and a0, a0, s6
-; RV32I-NEXT:    mv a1, s3
-; RV32I-NEXT:    call __mulsi3
-; RV32I-NEXT:    bnez s0, .LBB11_2
-; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    addi a0, a0, 32
-; RV32I-NEXT:    j .LBB11_3
-; RV32I-NEXT:  .LBB11_2:
-; RV32I-NEXT:    srli a0, s1, 24
-; RV32I-NEXT:  .LBB11_3:
 ; RV32I-NEXT:    li a1, 0
-; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_ctlz_i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    beqz a0, .LBB11_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 2
@@ -1481,14 +1423,21 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    slli a1, a0, 8
+; RV64I-NEXT:    add a1, a0, a1
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    add a1, a0, a1
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    add a1, a0, a1
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    add a1, a0, a1
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    add a1, a0, a1
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    add a1, a0, a1
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ; RV64I-NEXT:  .LBB11_2:
 ; RV64I-NEXT:    li a0, 64
@@ -1831,8 +1780,6 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
 define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV32I-LABEL: test_ctlz_i32_zero_undef:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 2
@@ -1860,52 +1807,54 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
 ; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi a1, a1, 257
-; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
-; RV64I-LABEL: test_ctlz_i32_zero_undef:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
-; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
-; RV64I-NEXT:    ret
+; RV64NOZBB-LABEL: test_ctlz_i32_zero_undef:
+; RV64NOZBB:       # %bb.0:
+; RV64NOZBB-NEXT:    srliw a1, a0, 1
+; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    srliw a1, a0, 2
+; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    srliw a1, a0, 4
+; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    srliw a1, a0, 8
+; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    srliw a1, a0, 16
+; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    not a0, a0
+; RV64NOZBB-NEXT:    srli a1, a0, 1
+; RV64NOZBB-NEXT:    lui a2, 349525
+; RV64NOZBB-NEXT:    addiw a2, a2, 1365
+; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    sub a0, a0, a1
+; RV64NOZBB-NEXT:    lui a1, 209715
+; RV64NOZBB-NEXT:    addiw a1, a1, 819
+; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    srli a0, a0, 2
+; RV64NOZBB-NEXT:    and a0, a0, a1
+; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    srli a1, a0, 4
+; RV64NOZBB-NEXT:    add a0, a0, a1
+; RV64NOZBB-NEXT:    lui a1, 61681
+; RV64NOZBB-NEXT:    addi a2, a1, -241
+; RV64NOZBB-NEXT:    and a2, a0, a2
+; RV64NOZBB-NEXT:    slli a0, a0, 8
+; RV64NOZBB-NEXT:    addi a1, a1, -256
+; RV64NOZBB-NEXT:    and a0, a0, a1
+; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    slli a0, a0, 8
+; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    slli a0, a0, 8
+; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    srliw a0, a0, 24
+; RV64NOZBB-NEXT:    ret
 ;
 ; RV32M-LABEL: test_ctlz_i32_zero_undef:
 ; RV32M:       # %bb.0:
@@ -1942,41 +1891,6 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    ret
 ;
-; RV64M-LABEL: test_ctlz_i32_zero_undef:
-; RV64M:       # %bb.0:
-; RV64M-NEXT:    srliw a1, a0, 1
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 2
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 4
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 8
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 16
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    not a0, a0
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    and a1, a1, a2
-; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
-; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addi a1, a1, 257
-; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    srliw a0, a0, 24
-; RV64M-NEXT:    ret
-;
 ; RV32ZBB-LABEL: test_ctlz_i32_zero_undef:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    clz a0, a0
@@ -2005,19 +1919,16 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV32I-LABEL: test_ctlz_i64_zero_undef:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a1
-; RV32I-NEXT:    mv s2, a0
-; RV32I-NEXT:    srli a0, a1, 1
-; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    addi a4, a2, 1365
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a3, a2, 819
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    bnez a1, .LBB15_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 4
@@ -2028,28 +1939,28 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    not a0, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi s4, a2, 1365
-; RV32I-NEXT:    and a1, a1, s4
+; RV32I-NEXT:    and a1, a1, a4
 ; RV32I-NEXT:    sub a...
[truncated]


github-actions bot commented Mar 25, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.


✅ With the latest revision this PR passed the Python code formatter.

@jayfoad
Contributor

jayfoad commented Mar 25, 2024

You can do this with fewer iterations by doubling the shift amount each time, e.g. for 64 bits:

v += v << 8;
v += v << 16;
v += v << 32;
return v >> 56;

@topperc
Collaborator

topperc commented Mar 25, 2024

You can do this with fewer iterations by doubling the shift amount each time, e.g. for 64 bits:

v += v << 8;
v += v << 16;
v += v << 32;
return v >> 56;

Or

v += v << 32
v += v << 16
v += v << 8
return v >> 56

But I think either would work.
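
Both orderings compute the same value. For reference, a standalone sketch of the doubling variant (plain C++, illustrative only; the function name is invented):

#include <cstdint>

// Doubling variant of the byte-sum finish for a 64-bit popcount: three
// shift-adds instead of seven. Safe because each per-byte partial count
// is at most 8, so every intermediate byte-wise sum is at most 64 and
// never carries into the neighboring byte.
uint64_t sum_bytes_log2(uint64_t v) {
  v += v << 8;   // each byte: itself plus the byte below (pairs)
  v += v << 16;  // each byte: sum of 4 adjacent bytes
  v += v << 32;  // top byte: sum of all 8 bytes
  return v >> 56;
}

The linear chain in the patch needs Len/8 - 1 shift-add pairs; the doubling form needs only log2(Len/8).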

arsenm (Contributor) left a comment:

Can you also make the mirror GlobalISel change?

case TargetOpcode::G_CTPOP: {

@wangpc-pp
Contributor Author

Can you also make the mirror GlobalISel change?

case TargetOpcode::G_CTPOP: {

Sure! I have added it! But I'm not familiar with GISel, so please double-check it!

    auto Shl = B.buildShl(Ty, ResTmp, ShiftC);
    ResTmp = B.buildAdd(Ty, ResTmp, Shl);
  }
  B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
Contributor:

This is assuming the result type is the same as the source type, which is not the case: CTPOP has two type indices, and the result type may differ from the source type.

Contributor:

I see this is an existing bug, so I suppose it's best to leave this as-is and fix it separately.

Contributor Author:

I'm not familiar with GlobalISel, so I will leave it to you guys to fix it. :-)
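
For readers following the thread: the fragment quoted above is the tail of the shift-add loop in the GlobalISel lowering. A hedged reconstruction of the plausible surrounding shape (ResTmp, ShiftC, and C_SizeM8 appear in the quote, and B8Count in the fragment below; Size and the doubling loop bounds are assumptions, not the committed code):

// Sketch only -- assumed shape of the G_CTPOP shift-add fallback.
auto ResTmp = B8Count;  // byte-wise partial counts from the SWAR steps
for (unsigned Shift = 8; Shift < Size; Shift *= 2) {
  auto ShiftC = B.buildConstant(Ty, Shift);
  auto Shl = B.buildShl(Ty, ResTmp, ShiftC);
  ResTmp = B.buildAdd(Ty, ResTmp, Shl);
}
B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);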

  };
  if (IsMulSupported(Ty)) {
    auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
    B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
Contributor:

Same here

@wangpc-pp wangpc-pp merged commit 610b9e2 into llvm:main Mar 29, 2024
4 checks passed
@wangpc-pp wangpc-pp deleted the main-popcount-without-mul branch March 29, 2024 07:39

Successfully merging this pull request may close these issues.

riscv popcount lowering shouldn't call __muldi3/__mulsi3