[x86] promote all multiply i8 by constant to i32

We have these 2 "isDesirable" promotion hooks (I'm not sure why we need both of them, but that's independent of this patch), and we can adjust them to promote "mul i8 X, C" to i32. Then, all of our existing LEA and other multiply expansion magic happens as it would for i32 ops. Some of the test diffs show that we could end up with an actual 32-bit mul instruction here because we choose not to expand to simpler ops. That instruction could be slower depending on the subtarget. On the plus side, this means we don't need a separate instruction to load the constant operand and possibly an extra instruction to move the result. If we need to tune mul i32 further, we could add a later transform that tries to shrink it back to i8 based on subtarget timing. I did not bother to duplicate all of the 32-bit test file RUNs and target settings that exist to test whether LEA expansion is cheap or not. The diffs here assume a default target, so that means LEA is generally cheap. Differential Revision: https://reviews.llvm.org/D54803 llvm-svn: 347557
llvm · Nov 26, 2018 · d31220e · d31220e
1 parent 2447baf
commit d31220e
Show file tree

Hide file tree

Showing 5 changed files with 165 additions and 155 deletions.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41060,10 +41060,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
-/// Return true if the target has native support for the specified value type
-/// and it is 'desirable' to use the type for the given node type. e.g. On x86
-/// i16 is legal, but undesirable since i16 instruction encodings are longer and
-/// some i16 instructions are slow.
 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   if (!isTypeLegal(VT))
     return false;
@@ -41072,26 +41068,37 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
     return false;
 
-  if (VT != MVT::i16)
-    return true;
-
-  switch (Opc) {
-  default:
-    return true;
-  case ISD::LOAD:
-  case ISD::SIGN_EXTEND:
-  case ISD::ZERO_EXTEND:
-  case ISD::ANY_EXTEND:
-  case ISD::SHL:
-  case ISD::SRL:
-  case ISD::SUB:
-  case ISD::ADD:
-  case ISD::MUL:
-  case ISD::AND:
-  case ISD::OR:
-  case ISD::XOR:
+  // 8-bit multiply is probably not much cheaper than 32-bit multiply, and
+  // we have specializations to turn 32-bit multiply into LEA or other ops.
+  // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
+  // check for a constant operand to the multiply.
+  if (Opc == ISD::MUL && VT == MVT::i8)
     return false;
+
+  // i16 instruction encodings are longer and some i16 instructions are slow,
+  // so those are not desirable.
+  if (VT == MVT::i16) {
+    switch (Opc) {
+    default:
+      break;
+    case ISD::LOAD:
+    case ISD::SIGN_EXTEND:
+    case ISD::ZERO_EXTEND:
+    case ISD::ANY_EXTEND:
+    case ISD::SHL:
+    case ISD::SRL:
+    case ISD::SUB:
+    case ISD::ADD:
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+      return false;
+    }
   }
+
+  // Any legal type not explicitly accounted for above here is desirable.
+  return true;
 }
 
 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
@@ -41110,12 +41117,16 @@ SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
   return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
 }
 
-/// This method query the target whether it is beneficial for dag combiner to
-/// promote the specified node. If true, it should return the desired promotion
-/// type by reference.
 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
   EVT VT = Op.getValueType();
-  if (VT != MVT::i16)
+  bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
+                             isa<ConstantSDNode>(Op.getOperand(1));
+
+  // i16 is legal, but undesirable since i16 instruction encodings are longer
+  // and some i16 instructions are slow.
+  // 8-bit multiply-by-constant can usually be expanded to something cheaper
+  // using LEA and/or other ALU ops.
+  if (VT != MVT::i16 && !Is8BitMulByConstant)
     return false;
 
   auto IsFoldableRMW = [](SDValue Load, SDValue Op) {

diff --git a/llvm/test/CodeGen/X86/ipra-reg-alias.ll b/llvm/test/CodeGen/X86/ipra-reg-alias.ll
@@ -1,19 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-- -enable-ipra -print-regusage -o - 2>&1 < %s | FileCheck %s --check-prefix=DEBUG
 ; RUN: llc -mtriple=x86_64-- -enable-ipra -o - < %s | FileCheck %s
 
-; Here only CL is clobbered so CH should not be clobbred, but CX, ECX and RCX
-; should be clobbered.
-; DEBUG: main Clobbered Registers: $ah $al $ax $cl $cx $eax $ecx $eflags $hax $rax $rcx
-
 define i8 @main(i8 %X) {
 ; CHECK-LABEL: main:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    movb $5, %cl
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    mulb %cl
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal (%rdi,%rdi,4), %eax
 ; CHECK-NEXT:    addb $5, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %inc = add i8 %X, 1
   %inc2 = mul i8 %inc, 5

diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll
@@ -518,33 +518,29 @@ define <2 x i64> @urem_op0_constant(i64* %p) nounwind {
 define <16 x i8> @urem_op1_constant(i8* %p) nounwind {
 ; SSE-LABEL: urem_op1_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movb (%rdi), %cl
-; SSE-NEXT:    movl %ecx, %eax
-; SSE-NEXT:    shrb %al
+; SSE-NEXT:    movb (%rdi), %al
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    shrb %cl
+; SSE-NEXT:    movzbl %cl, %ecx
+; SSE-NEXT:    imull $49, %ecx, %ecx
+; SSE-NEXT:    shrl $10, %ecx
+; SSE-NEXT:    imull $42, %ecx, %ecx
+; SSE-NEXT:    subb %cl, %al
 ; SSE-NEXT:    movzbl %al, %eax
-; SSE-NEXT:    imull $49, %eax, %eax
-; SSE-NEXT:    shrl $10, %eax
-; SSE-NEXT:    movb $42, %dl
-; SSE-NEXT:    # kill: def $al killed $al killed $eax
-; SSE-NEXT:    mulb %dl
-; SSE-NEXT:    subb %al, %cl
-; SSE-NEXT:    movzbl %cl, %eax
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: urem_op1_constant:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movb (%rdi), %cl
-; AVX-NEXT:    movl %ecx, %eax
-; AVX-NEXT:    shrb %al
+; AVX-NEXT:    movb (%rdi), %al
+; AVX-NEXT:    movl %eax, %ecx
+; AVX-NEXT:    shrb %cl
+; AVX-NEXT:    movzbl %cl, %ecx
+; AVX-NEXT:    imull $49, %ecx, %ecx
+; AVX-NEXT:    shrl $10, %ecx
+; AVX-NEXT:    imull $42, %ecx, %ecx
+; AVX-NEXT:    subb %cl, %al
 ; AVX-NEXT:    movzbl %al, %eax
-; AVX-NEXT:    imull $49, %eax, %eax
-; AVX-NEXT:    shrl $10, %eax
-; AVX-NEXT:    movb $42, %dl
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    mulb %dl
-; AVX-NEXT:    subb %al, %cl
-; AVX-NEXT:    movzbl %cl, %eax
 ; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    retq
   %x = load i8, i8* %p