Skip to content

Commit

Permalink
[TargetLowering] Expand the last stage of i16 popcnt using shift+add+…
Browse files Browse the repository at this point in the history
…and instead of mul+shift.

If we used a multiply, the constant would be 0x0101, which is 1 more than a
power of 2. On some targets we would expand that multiply to shl+add. By
avoiding the multiply earlier, we can generate better code.

Note that PowerPC doesn't do the shl+add expansion of multiply, so one of
the tests increased in instruction count.

This is limited to scalars because the change almost always increased the
number of instructions in vector tests.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D125638
  • Loading branch information
topperc committed May 16, 2022
1 parent e6fc845 commit 1c4880a
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 89 deletions.
12 changes: 12 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Expand Up @@ -7716,6 +7716,18 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
if (Len <= 8)
return Op;

// Avoid the multiply if we only have 2 bytes to add.
// TODO: Only doing this for scalars because vectors weren't as obviously
// improved.
if (Len == 16 && !VT.isVector()) {
// v = (v + (v >> 8)) & 0x00FF;
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::ADD, dl, VT, Op,
DAG.getNode(ISD::SRL, dl, VT, Op,
DAG.getConstant(8, dl, ShVT))),
DAG.getConstant(0xFF, dl, VT));
}

// v = (v * 0x01010101...) >> (Len - 8)
SDValue Mask01 =
DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
Expand Down
19 changes: 10 additions & 9 deletions llvm/test/CodeGen/PowerPC/popcnt-zext.ll
Expand Up @@ -23,9 +23,9 @@ define i16 @zpop_i8_i16(i8 %x) {
; SLOW-NEXT: add 3, 4, 3
; SLOW-NEXT: srwi 4, 3, 4
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: andi. 3, 3, 3855
; SLOW-NEXT: mulli 3, 3, 257
; SLOW-NEXT: rlwinm 3, 3, 24, 24, 31
; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
; SLOW-NEXT: clrlwi 3, 3, 28
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: blr
%z = zext i8 %x to i16
%pop = tail call i16 @llvm.ctpop.i16(i16 %z)
Expand Down Expand Up @@ -172,9 +172,10 @@ define i32 @popz_i16_32(i16 %x) {
; SLOW-NEXT: add 3, 4, 3
; SLOW-NEXT: srwi 4, 3, 4
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: andi. 3, 3, 3855
; SLOW-NEXT: mulli 3, 3, 257
; SLOW-NEXT: rlwinm 3, 3, 24, 24, 31
; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
; SLOW-NEXT: clrlwi 3, 3, 28
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: clrldi 3, 3, 32
; SLOW-NEXT: blr
%pop = tail call i16 @llvm.ctpop.i16(i16 %x)
%z = zext i16 %pop to i32
Expand Down Expand Up @@ -276,9 +277,9 @@ define i64 @popa_i16_i64(i16 %x) {
; SLOW-NEXT: add 3, 4, 3
; SLOW-NEXT: srwi 4, 3, 4
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: andi. 3, 3, 3855
; SLOW-NEXT: mulli 3, 3, 257
; SLOW-NEXT: srwi 3, 3, 8
; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
; SLOW-NEXT: clrlwi 3, 3, 28
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: rlwinm 3, 3, 0, 27, 27
; SLOW-NEXT: blr
%pop = call i16 @llvm.ctpop.i16(i16 %x)
Expand Down
110 changes: 40 additions & 70 deletions llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
Expand Up @@ -110,13 +110,10 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
; RV32_NOZBB-NEXT: lui a1, 1
; RV32_NOZBB-NEXT: addi a1, a1, -241
; RV32_NOZBB-NEXT: and a0, a0, a1
; RV32_NOZBB-NEXT: slli a1, a0, 8
; RV32_NOZBB-NEXT: andi a1, a0, 15
; RV32_NOZBB-NEXT: slli a0, a0, 20
; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: slli a0, a0, 19
; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
; RV32_NOZBB-NEXT: .LBB1_2:
; RV32_NOZBB-NEXT: li a0, 16
Expand All @@ -143,14 +140,11 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
; RV64NOZBB-NEXT: add a0, a0, a1
; RV64NOZBB-NEXT: lui a1, 1
; RV64NOZBB-NEXT: addiw a1, a1, -241
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: slliw a1, a0, 8
; RV64NOZBB-NEXT: addw a0, a1, a0
; RV64NOZBB-NEXT: slli a0, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 59
; RV64NOZBB-NEXT: addw a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 15
; RV64NOZBB-NEXT: slli a0, a0, 52
; RV64NOZBB-NEXT: srli a0, a0, 60
; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
; RV64NOZBB-NEXT: .LBB1_2:
; RV64NOZBB-NEXT: li a0, 16
Expand Down Expand Up @@ -606,13 +600,10 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
; RV32_NOZBB-NEXT: lui a1, 1
; RV32_NOZBB-NEXT: addi a1, a1, -241
; RV32_NOZBB-NEXT: and a0, a0, a1
; RV32_NOZBB-NEXT: slli a1, a0, 8
; RV32_NOZBB-NEXT: andi a1, a0, 15
; RV32_NOZBB-NEXT: slli a0, a0, 20
; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: slli a0, a0, 19
; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
;
; RV64NOZBB-LABEL: test_cttz_i16_zero_undef:
Expand All @@ -632,14 +623,11 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
; RV64NOZBB-NEXT: add a0, a0, a1
; RV64NOZBB-NEXT: lui a1, 1
; RV64NOZBB-NEXT: addiw a1, a1, -241
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: slliw a1, a0, 8
; RV64NOZBB-NEXT: addw a0, a1, a0
; RV64NOZBB-NEXT: slli a0, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 59
; RV64NOZBB-NEXT: addw a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 15
; RV64NOZBB-NEXT: slli a0, a0, 52
; RV64NOZBB-NEXT: srli a0, a0, 60
; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
;
; RV32ZBB-LABEL: test_cttz_i16_zero_undef:
Expand Down Expand Up @@ -1096,13 +1084,10 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
; RV32_NOZBB-NEXT: lui a1, 1
; RV32_NOZBB-NEXT: addi a1, a1, -241
; RV32_NOZBB-NEXT: and a0, a0, a1
; RV32_NOZBB-NEXT: slli a1, a0, 8
; RV32_NOZBB-NEXT: andi a1, a0, 15
; RV32_NOZBB-NEXT: slli a0, a0, 20
; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: slli a0, a0, 19
; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
; RV32_NOZBB-NEXT: .LBB9_2:
; RV32_NOZBB-NEXT: li a0, 16
Expand Down Expand Up @@ -1138,14 +1123,11 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
; RV64NOZBB-NEXT: add a0, a0, a1
; RV64NOZBB-NEXT: lui a1, 1
; RV64NOZBB-NEXT: addiw a1, a1, -241
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: slliw a1, a0, 8
; RV64NOZBB-NEXT: addw a0, a1, a0
; RV64NOZBB-NEXT: slli a0, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 59
; RV64NOZBB-NEXT: addw a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 15
; RV64NOZBB-NEXT: slli a0, a0, 52
; RV64NOZBB-NEXT: srli a0, a0, 60
; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
; RV64NOZBB-NEXT: .LBB9_2:
; RV64NOZBB-NEXT: li a0, 16
Expand Down Expand Up @@ -1713,13 +1695,10 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
; RV32_NOZBB-NEXT: lui a1, 1
; RV32_NOZBB-NEXT: addi a1, a1, -241
; RV32_NOZBB-NEXT: and a0, a0, a1
; RV32_NOZBB-NEXT: slli a1, a0, 8
; RV32_NOZBB-NEXT: andi a1, a0, 15
; RV32_NOZBB-NEXT: slli a0, a0, 20
; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: slli a0, a0, 19
; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
;
; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef:
Expand Down Expand Up @@ -1749,14 +1728,11 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
; RV64NOZBB-NEXT: add a0, a0, a1
; RV64NOZBB-NEXT: lui a1, 1
; RV64NOZBB-NEXT: addiw a1, a1, -241
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: slliw a1, a0, 8
; RV64NOZBB-NEXT: addw a0, a1, a0
; RV64NOZBB-NEXT: slli a0, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 59
; RV64NOZBB-NEXT: addw a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 15
; RV64NOZBB-NEXT: slli a0, a0, 52
; RV64NOZBB-NEXT: srli a0, a0, 60
; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
;
; RV32ZBB-LABEL: test_ctlz_i16_zero_undef:
Expand Down Expand Up @@ -2251,13 +2227,10 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
; RV32_NOZBB-NEXT: lui a1, 1
; RV32_NOZBB-NEXT: addi a1, a1, -241
; RV32_NOZBB-NEXT: and a0, a0, a1
; RV32_NOZBB-NEXT: slli a1, a0, 8
; RV32_NOZBB-NEXT: andi a1, a0, 15
; RV32_NOZBB-NEXT: slli a0, a0, 20
; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: slli a0, a0, 19
; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
;
; RV64NOZBB-LABEL: test_ctpop_i16:
Expand All @@ -2274,14 +2247,11 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
; RV64NOZBB-NEXT: add a0, a0, a1
; RV64NOZBB-NEXT: lui a1, 1
; RV64NOZBB-NEXT: addiw a1, a1, -241
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: slliw a1, a0, 8
; RV64NOZBB-NEXT: addw a0, a1, a0
; RV64NOZBB-NEXT: slli a0, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 59
; RV64NOZBB-NEXT: addw a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 15
; RV64NOZBB-NEXT: slli a0, a0, 52
; RV64NOZBB-NEXT: srli a0, a0, 60
; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
;
; RV32ZBB-LABEL: test_ctpop_i16:
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/parity-vec.ll
Expand Up @@ -64,9 +64,8 @@ define i1 @canonical_parity_noncanonical_pred(<16 x i1> %x) {
; NOPOPCNT-NEXT: addl %eax, %ecx
; NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F
; NOPOPCNT-NEXT: movl %ecx, %eax
; NOPOPCNT-NEXT: shll $8, %eax
; NOPOPCNT-NEXT: addl %ecx, %eax
; NOPOPCNT-NEXT: shrl $8, %eax
; NOPOPCNT-NEXT: addl %ecx, %eax
; NOPOPCNT-NEXT: # kill: def $al killed $al killed $eax
; NOPOPCNT-NEXT: retq
;
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/X86/popcnt.ll
Expand Up @@ -77,9 +77,9 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $8, %eax
; X86-NEXT: shrl $8, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movzbl %ah, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
Expand All @@ -99,9 +99,9 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X64-NEXT: addl %edi, %eax
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shll $8, %ecx
; X64-NEXT: shrl $8, %ecx
; X64-NEXT: addl %eax, %ecx
; X64-NEXT: movzbl %ch, %eax
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
Expand Down Expand Up @@ -1540,9 +1540,9 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $8, %eax
; X86-NEXT: shrl $8, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movzbl %ah, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: popcount_i16_zext:
Expand All @@ -1561,9 +1561,9 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
; X64-NEXT: addl %edi, %eax
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shll $8, %ecx
; X64-NEXT: shrl $8, %ecx
; X64-NEXT: addl %eax, %ecx
; X64-NEXT: movzbl %ch, %eax
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
;
; X86-POPCNT-LABEL: popcount_i16_zext:
Expand Down

0 comments on commit 1c4880a

Please sign in to comment.