[RISCV] Teach isel to select ADDW/SUBW/MULW/SLLIW when only the lower 32 bits are used.

We normally select these when the root node is a sext_inreg, but
SimplifyDemandedBits can sometimes bypass the sext_inreg for some
users. This can create a situation where sext_inreg+add/sub/mul/shl
is selected to a W instruction, and then the add/sub/mul/shl is
separately selected to a non-W instruction with the same inputs.

This patch tries to detect when it would still be ok to use a W
instruction without the sext_inreg by checking the direct users.
This can allow the W instruction to CSE with one created for a
sext_inreg+add/sub/mul/shl. To minimize complexity and cost of
checking, we make no attempt to determine if the CSE will happen
and just always use a W instruction when we can.
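
As a minimal sketch of the situation (IR adapted from the
add32_sext_reject_on_rv64 test updated below; the function name @f is
only illustrative):

@gv0 = external global i32

define signext i32 @f(i32 signext %a) nounwind {
  %b = add nsw i32 %a, 3000
  store i32 %b, i32* @gv0, align 4
  ret i32 %b
}

The signext return keeps a sext_inreg over the add, which selects ADDW;
SimplifyDemandedBits lets the store bypass the sext_inreg, so the value
feeding the SW previously selected a separate non-W ADD with the same
inputs. With this change that add also selects ADDW and can CSE with
the first one.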

Differential Revision: https://reviews.llvm.org/D107658
topperc committed Aug 18, 2021
1 parent 6cc1109 commit d9ba1a9
Showing 33 changed files with 692 additions and 610 deletions.
82 changes: 82 additions & 0 deletions llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1496,6 +1496,88 @@ bool RISCVDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) {
  return false;
}

// Return true if all users of this SDNode* only consume the lower \p Bits.
// This can be used to form W instructions for add/sub/mul/shl even when the
// root isn't a sext_inreg. This can allow the ADDW/SUBW/MULW/SLLIW to CSE if
// SimplifyDemandedBits has made it so some users see a sext_inreg and some
// don't. The sext_inreg+add/sub/mul/shl will get selected, but still leave
// the add/sub/mul/shl to become non-W instructions. By checking the users we
// may be able to use a W instruction and CSE with the other instruction if
// this has happened. We could try to detect that the CSE opportunity exists
// before doing this, but that would be more complicated.
// TODO: Does this need to look through AND/OR/XOR to their users to find more
// opportunities?
bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
  assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB ||
          Node->getOpcode() == ISD::MUL || Node->getOpcode() == ISD::SHL) &&
         "Unexpected opcode");

  for (auto UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    // Users of this node should have already been instruction selected
    if (!User->isMachineOpcode())
      return false;

    // TODO: Add more opcodes?
    switch (User->getMachineOpcode()) {
    default:
      return false;
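    // These W-form instructions only consume the lower 32 bits of their inputs.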
    case RISCV::ADDW:
    case RISCV::ADDIW:
    case RISCV::SUBW:
    case RISCV::MULW:
    case RISCV::SLLW:
    case RISCV::SLLIW:
    case RISCV::SRAW:
    case RISCV::SRAIW:
    case RISCV::SRLW:
    case RISCV::SRLIW:
    case RISCV::DIVW:
    case RISCV::DIVUW:
    case RISCV::REMW:
    case RISCV::REMUW:
    case RISCV::ROLW:
    case RISCV::RORW:
    case RISCV::RORIW:
    case RISCV::CLZW:
    case RISCV::CTZW:
    case RISCV::CPOPW:
    case RISCV::SLLIUW:
      if (Bits < 32)
        return false;
      break;
    case RISCV::SLLI:
      // SLLI only uses the lower (XLen - ShAmt) bits.
      if (Bits < Subtarget->getXLen() - User->getConstantOperandVal(1))
        return false;
      break;
    case RISCV::ADDUW:
    case RISCV::SH1ADDUW:
    case RISCV::SH2ADDUW:
    case RISCV::SH3ADDUW:
      // The first operand to add.uw/shXadd.uw is implicitly zero extended from
      // 32 bits.
      if (UI.getOperandNo() != 0 || Bits < 32)
        return false;
      break;
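    // SB/SH/SW only read the lowest 8/16/32 bits of the stored value (operand 0).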
    case RISCV::SB:
      if (UI.getOperandNo() != 0 || Bits < 8)
        return false;
      break;
    case RISCV::SH:
      if (UI.getOperandNo() != 0 || Bits < 16)
        return false;
      break;
    case RISCV::SW:
      if (UI.getOperandNo() != 0 || Bits < 32)
        return false;
      break;
    }
  }

  return true;
}

// Select VL as a 5 bit immediate or a value that will become a register. This
// allows us to choose between VSETIVLI or VSETVLI later.
bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) {
3 changes: 3 additions & 0 deletions llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -58,6 +58,9 @@ class RISCVDAGToDAGISel : public SelectionDAGISel {
  bool selectSExti32(SDValue N, SDValue &Val);
  bool selectZExti32(SDValue N, SDValue &Val);

  bool hasAllNBitUsers(SDNode *Node, unsigned Bits) const;
  bool hasAllWUsers(SDNode *Node) const { return hasAllNBitUsers(Node, 32); }

  bool selectVLOp(SDValue N, SDValue &VL);

  bool selectVSplat(SDValue N, SDValue &SplatVal);
15 changes: 15 additions & 0 deletions llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1254,6 +1254,14 @@ def : Pat<(i64 (shl (and GPR:$rs1, 0xffffffff), uimm5:$shamt)),
          (SRLI (SLLI GPR:$rs1, 32), (ImmSubFrom32 uimm5:$shamt))>;
}

// PatFrag to allow ADDW/SUBW/MULW/SLLIW to be selected from i64 add/sub/mul/shl
// if only the lower 32 bits of their result are used.
class overflowingbinopw<SDPatternOperator operator>
    : PatFrag<(ops node:$lhs, node:$rhs),
              (operator node:$lhs, node:$rhs), [{
  return hasAllWUsers(Node);
}]>;

let Predicates = [IsRV64] in {

/// sext and zext
@@ -1283,6 +1291,13 @@ def : PatGprGpr<shiftopw<riscv_sllw>, SLLW>;
def : PatGprGpr<shiftopw<riscv_srlw>, SRLW>;
def : PatGprGpr<shiftopw<riscv_sraw>, SRAW>;

// Select W instructions without sext_inreg if only the lower 32 bits of the
// result are used.
def : PatGprGpr<overflowingbinopw<add>, ADDW>;
def : PatGprSimm12<overflowingbinopw<add>, ADDIW>;
def : PatGprGpr<overflowingbinopw<sub>, SUBW>;
def : PatGprImm<overflowingbinopw<shl>, SLLIW, uimm5>;

/// Loads

defm : LdPat<sextloadi32, LW, i64>;
4 changes: 4 additions & 0 deletions llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -75,6 +75,10 @@ let Predicates = [HasStdExtM, IsRV64] in {
def : Pat<(sext_inreg (mul GPR:$rs1, GPR:$rs2), i32),
          (MULW GPR:$rs1, GPR:$rs2)>;

// Select W instructions without sext_inreg if only the lower 32 bits of the
// result are used.
def : PatGprGpr<overflowingbinopw<mul>, MULW>;

def : PatGprGpr<riscv_divw, DIVW>;
def : PatGprGpr<riscv_divuw, DIVUW>;
def : PatGprGpr<riscv_remuw, REMUW>;
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -21,7 +21,7 @@ define signext i32 @add_small_const(i32 signext %a) nounwind {
;
; RV64I-LABEL: add_small_const:
; RV64I: # %bb.0:
; RV64I-NEXT: addi a0, a0, 1
; RV64I-NEXT: addiw a0, a0, 1
; RV64I-NEXT: slli a0, a0, 56
; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: jalr zero, 0(ra)
@@ -35,7 +35,7 @@ define signext i32 @add_small_const(i32 signext %a) nounwind {
;
; RV64C-LABEL: add_small_const:
; RV64C: # %bb.0:
; RV64C-NEXT: c.addi a0, 1
; RV64C-NEXT: c.addiw a0, 1
; RV64C-NEXT: c.slli a0, 56
; RV64C-NEXT: c.srai a0, 56
; RV64C-NEXT: c.jr ra
@@ -75,7 +75,7 @@ define signext i32 @add_large_const(i32 signext %a) nounwind {
; RV64C: # %bb.0:
; RV64C-NEXT: c.lui a1, 1
; RV64C-NEXT: c.addiw a1, -1
; RV64C-NEXT: c.add a0, a1
; RV64C-NEXT: c.addw a0, a1
; RV64C-NEXT: c.slli a0, 48
; RV64C-NEXT: c.srai a0, 48
; RV64C-NEXT: c.jr ra
@@ -115,7 +115,7 @@ define signext i32 @add_huge_const(i32 signext %a) nounwind {
; RV64C: # %bb.0:
; RV64C-NEXT: c.lui a1, 8
; RV64C-NEXT: c.addiw a1, -1
; RV64C-NEXT: c.add a0, a1
; RV64C-NEXT: c.addw a0, a1
; RV64C-NEXT: c.slli a0, 48
; RV64C-NEXT: c.srai a0, 48
; RV64C-NEXT: c.jr ra
@@ -135,7 +135,7 @@ define signext i24 @add_non_machine_type(i24 signext %a) nounwind {
;
; RV64I-LABEL: add_non_machine_type:
; RV64I: # %bb.0:
; RV64I-NEXT: addi a0, a0, 256
; RV64I-NEXT: addiw a0, a0, 256
; RV64I-NEXT: slli a0, a0, 52
; RV64I-NEXT: srai a0, a0, 40
; RV64I-NEXT: jalr zero, 0(ra)
@@ -149,7 +149,7 @@ define signext i24 @add_non_machine_type(i24 signext %a) nounwind {
;
; RV64C-LABEL: add_non_machine_type:
; RV64C: # %bb.0:
; RV64C-NEXT: addi a0, a0, 256
; RV64C-NEXT: addiw a0, a0, 256
; RV64C-NEXT: c.slli a0, 52
; RV64C-NEXT: c.srai a0, 40
; RV64C-NEXT: c.jr ra
9 changes: 4 additions & 5 deletions llvm/test/CodeGen/RISCV/add-imm.ll
@@ -180,10 +180,9 @@ define signext i32 @add32_sext_reject_on_rv64(i32 signext %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: lui a1, 1
; RV64I-NEXT: addiw a1, a1, -1096
; RV64I-NEXT: add a2, a0, a1
; RV64I-NEXT: lui a3, %hi(gv0)
; RV64I-NEXT: addw a0, a0, a1
; RV64I-NEXT: sw a2, %lo(gv0)(a3)
; RV64I-NEXT: lui a1, %hi(gv0)
; RV64I-NEXT: sw a0, %lo(gv0)(a1)
; RV64I-NEXT: ret
%b = add nsw i32 %a, 3000
store i32 %b, i32* @gv0, align 4
@@ -234,8 +233,8 @@ define void @add32_reject() nounwind {
; RV64I-NEXT: lw a3, %lo(gb)(a2)
; RV64I-NEXT: lui a4, 1
; RV64I-NEXT: addiw a4, a4, -1096
; RV64I-NEXT: add a1, a1, a4
; RV64I-NEXT: add a3, a3, a4
; RV64I-NEXT: addw a1, a1, a4
; RV64I-NEXT: addw a3, a3, a4
; RV64I-NEXT: sw a1, %lo(ga)(a0)
; RV64I-NEXT: sw a3, %lo(gb)(a2)
; RV64I-NEXT: ret
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -18,7 +18,7 @@ define signext i32 @add_mul_trans_accept_1(i32 %x) {
; RV64IM-LABEL: add_mul_trans_accept_1:
; RV64IM: # %bb.0:
; RV64IM-NEXT: addi a1, zero, 11
; RV64IM-NEXT: mul a0, a0, a1
; RV64IM-NEXT: mulw a0, a0, a1
; RV64IM-NEXT: addiw a0, a0, 407
; RV64IM-NEXT: ret
%tmp0 = add i32 %x, 37
Expand All @@ -39,7 +39,7 @@ define signext i32 @add_mul_trans_accept_2(i32 %x) {
; RV64IM-LABEL: add_mul_trans_accept_2:
; RV64IM: # %bb.0:
; RV64IM-NEXT: addi a1, zero, 13
; RV64IM-NEXT: mul a0, a0, a1
; RV64IM-NEXT: mulw a0, a0, a1
; RV64IM-NEXT: lui a1, 28
; RV64IM-NEXT: addiw a1, a1, 1701
; RV64IM-NEXT: addw a0, a0, a1
Expand All @@ -62,7 +62,7 @@ define signext i32 @add_mul_trans_reject_1(i32 %x) {
; RV64IM-LABEL: add_mul_trans_reject_1:
; RV64IM: # %bb.0:
; RV64IM-NEXT: addi a1, zero, 19
; RV64IM-NEXT: mul a0, a0, a1
; RV64IM-NEXT: mulw a0, a0, a1
; RV64IM-NEXT: lui a1, 9
; RV64IM-NEXT: addiw a1, a1, 585
; RV64IM-NEXT: addw a0, a0, a1
Expand All @@ -87,7 +87,7 @@ define signext i32 @add_mul_trans_reject_2(i32 %x) {
; RV64IM: # %bb.0:
; RV64IM-NEXT: lui a1, 792
; RV64IM-NEXT: addiw a1, a1, -1709
; RV64IM-NEXT: mul a0, a0, a1
; RV64IM-NEXT: mulw a0, a0, a1
; RV64IM-NEXT: lui a1, 1014660
; RV64IM-NEXT: addiw a1, a1, -1891
; RV64IM-NEXT: addw a0, a0, a1
