
[RISCV] Use bset+addi for (not (sll -1, X)). #72549

Closed

wants to merge 2 commits
Conversation

topperc (Collaborator) commented Nov 16, 2023

This is an alternative to #71420 that handles i32 on RV64 safely by pre-promoting the pattern in DAG combine.

llvmbot (Collaborator) commented Nov 16, 2023

@llvm/pr-subscribers-backend-risc-v

Author: Craig Topper (topperc)

Changes

This is an alternative to #71420 that handles i32 on RV64 safely by pre-promoting the pattern in DAG combine.


Full diff: https://github.com/llvm/llvm-project/pull/72549.diff

4 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+15)
  • (modified) llvm/lib/Target/RISCV/RISCVInstrInfoZb.td (+2)
  • (modified) llvm/test/CodeGen/RISCV/rv32zbs.ll (+93)
  • (modified) llvm/test/CodeGen/RISCV/rv64zbs.ll (+75)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index fc0f59a09315b76..f89f300a4e9e50c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12362,6 +12362,21 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // Pre-promote (i32 (xor (shl -1, X), ~0)) on RV64 with Zbs so we can use
+  // (ADDI (BSET X0, X), -1). If we wait until type legalization, we'll create
+  // RISCVISD::SLLW and we can't recover it to use a BSET instruction.
+  if (!RV64LegalI32 && Subtarget.is64Bit() && Subtarget.hasStdExtZbs() &&
+      N->getValueType(0) == MVT::i32 && isAllOnesConstant(N1) &&
+      N0.getOpcode() == ISD::SHL && isAllOnesConstant(N0.getOperand(0)) &&
+      !isa<ConstantSDNode>(N0.getOperand(1)) && N0.hasOneUse()) {
+    SDLoc DL(N);
+    SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N0.getOperand(0));
+    SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N0.getOperand(1));
+    SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i64, Op0, Op1);
+    SDValue Not = DAG.getNOT(DL, Shl, MVT::i64);
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Not);
+  }
+
   // fold (xor (sllw 1, x), -1) -> (rolw ~1, x)
   // NOTE: Assumes ROL being legal means ROLW is legal.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index d1c29842b85d38a..f7f8560b57b5c2d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -554,6 +554,8 @@ def : Pat<(XLenVT (and (shiftop<srl> GPR:$rs1, (XLenVT GPR:$rs2)), 1)),
 
 def : Pat<(XLenVT (shiftop<shl> 1, (XLenVT GPR:$rs2))),
           (BSET (XLenVT X0), GPR:$rs2)>;
+def : Pat<(XLenVT (not (shiftop<shl> -1, (XLenVT GPR:$rs2)))),
+          (ADDI (BSET (XLenVT X0), GPR:$rs2), -1)>;
 
 def : Pat<(XLenVT (and GPR:$rs1, BCLRMask:$mask)),
           (BCLRI GPR:$rs1, BCLRMask:$mask)>;
diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll
index 460d15991788238..ccda8f4e5dd0595 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll
@@ -744,3 +744,96 @@ define i32 @or_i32_66901(i32 %a) nounwind {
   %or = or i32 %a, 66901
   ret i32 %or
 }
+
+define i32 @bset_trailing_ones_i32_mask(i32 %a) nounwind {
+; RV32I-LABEL: bset_trailing_ones_i32_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, -1
+; RV32I-NEXT:    sll a0, a1, a0
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBS-LABEL: bset_trailing_ones_i32_mask:
+; RV32ZBS:       # %bb.0:
+; RV32ZBS-NEXT:    bset a0, zero, a0
+; RV32ZBS-NEXT:    addi a0, a0, -1
+; RV32ZBS-NEXT:    ret
+  %and = and i32 %a, 31
+  %shift = shl nsw i32 -1, %and
+  %not = xor i32 %shift, -1
+  ret i32 %not
+}
+
+define i32 @bset_trailing_ones_i32_no_mask(i32 %a) nounwind {
+; RV32I-LABEL: bset_trailing_ones_i32_no_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, -1
+; RV32I-NEXT:    sll a0, a1, a0
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBS-LABEL: bset_trailing_ones_i32_no_mask:
+; RV32ZBS:       # %bb.0:
+; RV32ZBS-NEXT:    bset a0, zero, a0
+; RV32ZBS-NEXT:    addi a0, a0, -1
+; RV32ZBS-NEXT:    ret
+  %shift = shl nsw i32 -1, %a
+  %not = xor i32 %shift, -1
+  ret i32 %not
+}
+
+define i64 @bset_trailing_ones_i64_mask(i64 %a) nounwind {
+; CHECK-LABEL: bset_trailing_ones_i64_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, -1
+; CHECK-NEXT:    andi a3, a0, 63
+; CHECK-NEXT:    addi a1, a3, -32
+; CHECK-NEXT:    sll a0, a2, a0
+; CHECK-NEXT:    bltz a1, .LBB43_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    sll a2, a2, a3
+; CHECK-NEXT:    j .LBB43_3
+; CHECK-NEXT:  .LBB43_2:
+; CHECK-NEXT:    not a2, a3
+; CHECK-NEXT:    lui a3, 524288
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    srl a2, a3, a2
+; CHECK-NEXT:    or a2, a0, a2
+; CHECK-NEXT:  .LBB43_3:
+; CHECK-NEXT:    srai a1, a1, 31
+; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    not a1, a2
+; CHECK-NEXT:    not a0, a0
+; CHECK-NEXT:    ret
+  %and = and i64 %a, 63
+  %shift = shl nsw i64 -1, %and
+  %not = xor i64 %shift, -1
+  ret i64 %not
+}
+
+define i64 @bset_trailing_ones_i64_no_mask(i64 %a) nounwind {
+; CHECK-LABEL: bset_trailing_ones_i64_no_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    addi a2, a0, -32
+; CHECK-NEXT:    sll a1, a1, a0
+; CHECK-NEXT:    bltz a2, .LBB44_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    j .LBB44_3
+; CHECK-NEXT:  .LBB44_2:
+; CHECK-NEXT:    not a0, a0
+; CHECK-NEXT:    lui a3, 524288
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    srl a0, a3, a0
+; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:  .LBB44_3:
+; CHECK-NEXT:    srai a2, a2, 31
+; CHECK-NEXT:    and a2, a2, a1
+; CHECK-NEXT:    not a1, a0
+; CHECK-NEXT:    not a0, a2
+; CHECK-NEXT:    ret
+  %shift = shl nsw i64 -1, %a
+  %not = xor i64 %shift, -1
+  ret i64 %not
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zbs.ll b/llvm/test/CodeGen/RISCV/rv64zbs.ll
index b30b3c15196076b..016b0924eaf1993 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbs.ll
@@ -1071,3 +1071,78 @@ define i64 @or_i64_66901(i64 %a) nounwind {
   %or = or i64 %a, 66901
   ret i64 %or
 }
+
+define signext i32 @bset_trailing_ones_i32_mask(i32 signext %a) nounwind {
+; RV64I-LABEL: bset_trailing_ones_i32_mask:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, -1
+; RV64I-NEXT:    sllw a0, a1, a0
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBS-LABEL: bset_trailing_ones_i32_mask:
+; RV64ZBS:       # %bb.0:
+; RV64ZBS-NEXT:    andi a0, a0, 31
+; RV64ZBS-NEXT:    bset a0, zero, a0
+; RV64ZBS-NEXT:    addi a0, a0, -1
+; RV64ZBS-NEXT:    ret
+  %and = and i32 %a, 31
+  %shift = shl nsw i32 -1, %and
+  %not = xor i32 %shift, -1
+  ret i32 %not
+}
+
+define signext i32 @bset_trailing_ones_i32_no_mask(i32 signext %a) nounwind {
+; RV64I-LABEL: bset_trailing_ones_i32_no_mask:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, -1
+; RV64I-NEXT:    sllw a0, a1, a0
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBS-LABEL: bset_trailing_ones_i32_no_mask:
+; RV64ZBS:       # %bb.0:
+; RV64ZBS-NEXT:    bset a0, zero, a0
+; RV64ZBS-NEXT:    addiw a0, a0, -1
+; RV64ZBS-NEXT:    ret
+  %shift = shl nsw i32 -1, %a
+  %not = xor i32 %shift, -1
+  ret i32 %not
+}
+
+define signext i64 @bset_trailing_ones_i64_mask(i64 signext %a) nounwind {
+; RV64I-LABEL: bset_trailing_ones_i64_mask:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, -1
+; RV64I-NEXT:    sll a0, a1, a0
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBS-LABEL: bset_trailing_ones_i64_mask:
+; RV64ZBS:       # %bb.0:
+; RV64ZBS-NEXT:    bset a0, zero, a0
+; RV64ZBS-NEXT:    addi a0, a0, -1
+; RV64ZBS-NEXT:    ret
+  %and = and i64 %a, 63
+  %shift = shl nsw i64 -1, %and
+  %not = xor i64 %shift, -1
+  ret i64 %not
+}
+
+define signext i64 @bset_trailing_ones_i64_no_mask(i64 signext %a) nounwind {
+; RV64I-LABEL: bset_trailing_ones_i64_no_mask:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, -1
+; RV64I-NEXT:    sll a0, a1, a0
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBS-LABEL: bset_trailing_ones_i64_no_mask:
+; RV64ZBS:       # %bb.0:
+; RV64ZBS-NEXT:    bset a0, zero, a0
+; RV64ZBS-NEXT:    addi a0, a0, -1
+; RV64ZBS-NEXT:    ret
+  %shift = shl nsw i64 -1, %a
+  %not = xor i64 %shift, -1
+  ret i64 %not
+}

dtcxzyw (Member) commented Nov 16, 2023

BSET is expensive on both SiFive-7 (Latency = 3) and XiangShan-NanHu (Latency = 3).

rtayl (Collaborator) commented Nov 16, 2023

LGTM.

dtcxzyw (Member) commented Nov 16, 2023

BSET is expensive on both SiFive-7 (Latency = 3) and XiangShan-NanHu (Latency = 3).

SiFive:

// Single-bit instructions
// BEXT[I] instruction is available on all ALUs and the other instructions
// are only available on the SiFive7B pipe.
let Latency = 3 in {
def : WriteRes<WriteSingleBit, [SiFive7PipeB]>;
def : WriteRes<WriteSingleBitImm, [SiFive7PipeB]>;
def : WriteRes<WriteBEXT, [SiFive7PipeAB]>;
def : WriteRes<WriteBEXTI, [SiFive7PipeAB]>;
}

XiangShan-NanHu:
https://github.com/llvm/llvm-project/blob/b5a1feb9a374bd395795b26abd15ed9840a64339/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td#L96-L116

topperc (Collaborator, Author) commented Nov 16, 2023

BSET is expensive on both SiFive-7 (Latency = 3) and XiangShan-NanHu (Latency = 3).

SiFive:

// Single-bit instructions
// BEXT[I] instruction is available on all ALUs and the other instructions
// are only available on the SiFive7B pipe.
let Latency = 3 in {
def : WriteRes<WriteSingleBit, [SiFive7PipeB]>;
def : WriteRes<WriteSingleBitImm, [SiFive7PipeB]>;
def : WriteRes<WriteBEXT, [SiFive7PipeAB]>;
def : WriteRes<WriteBEXTI, [SiFive7PipeAB]>;
}

The latency for shifts is also listed as 3 on SiFive7. SiFive7 has an early ALU in the first stage of the execution pipeline and a late ALU in the third stage, and the latency is set to the total pipeline length. If the operands aren't ready at the beginning, the instruction won't execute in the early ALU and will wait for the late ALU. There should be a ReadAdvance to subtract 2 cycles from the latency.
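An adjustment of the kind described would be expressed as a ReadAdvance entry in the scheduling model. A rough sketch (the SchedRead name here is illustrative and assumed, not copied from the actual SiFive7 model):

```tablegen
// Sketch: operands are forwarded from the late ALU, so the scheduler may
// treat them as available 2 cycles earlier than the nominal 3-cycle
// latency implies.
def : ReadAdvance<ReadSingleBit, 2>;
```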

topperc (Collaborator, Author) commented Nov 16, 2023

Committed as 4eaf986 and 927f6f1.

I left a question in the review for RISCVSchedXiangShanNanHu.td. 3 cycle latency seems high to me and if accurate makes the instructions unattractive to use.
