[X86] Improve shift combining
This folds (ashr (shl a, [56,48,32,24,16]), SarConst)
into       (shl (sext a), [56,48,32,24,16] - SarConst)
or into    (sra (sext a), SarConst - [56,48,32,24,16]),
depending on the sign of (SarConst - [56,48,32,24,16]).

sexts on x86 lower to MOVs (movsbq, movswq, and so on). These MOVs have the
same code size as the shifts above (only a shift by 1 has a smaller encoding).
However, the MOVs have two advantages over shifts on x86 (a worked example
follows):
1. A MOV can write to a register other than its source.
2. A MOV accepts memory operands.
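
As a concrete sanity check of the fold (an editorial illustration, not part of
the commit), the PR24373 case (ShlConst = 56, SarConst = 48 on i64) can be
verified in plain C++. The snippet assumes two's-complement narrowing and an
arithmetic right shift for negative values, as x86 compilers provide:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t Inputs[] = {0x12345678, -1, 0x80, 0x7f};
  for (int64_t a : Inputs) {
    // Original pattern: (ashr (shl a, 56), 48). Shift left through
    // uint64_t to avoid signed-overflow UB; the right shift assumes
    // arithmetic >> on negative values (true for x86 compilers).
    int64_t shifted =
        static_cast<int64_t>(static_cast<uint64_t>(a) << 56) >> 48;
    // Folded form: SarConst - ShlConst = 48 - 56 = -8 < 0, so the combine
    // emits (shl (sext_inreg a, i8), 8), i.e. movsbq then shlq $8.
    // Multiply by 256 instead of << 8 to stay well-defined for negatives.
    int64_t folded = static_cast<int64_t>(static_cast<int8_t>(a)) * 256;
    assert(shifted == folded);
  }
  return 0;
}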

This fixes PR24373.

Patch by: evgeny.v.stupachenko@intel.com
Differential Revision: http://reviews.llvm.org/D13161

llvm-svn: 255761
Michael Kuperstein committed Dec 16, 2015
1 parent 94f181a commit e75e6e2
Showing 5 changed files with 1,104 additions and 854 deletions.
57 changes: 57 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24950,6 +24950,59 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  return SDValue();
}

static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned Size = VT.getSizeInBits();

  // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
  // into (shl (sext a), [56,48,32,24,16] - SarConst) or
  // into (sra (sext a), SarConst - [56,48,32,24,16]),
  // depending on the sign of (SarConst - [56,48,32,24,16]).

  // sexts on x86 are MOVs. The MOVs have the same code size as the
  // shifts above (only a shift by 1 has a smaller encoding).
  // However, the MOVs have 2 advantages over a shift:
  // 1. A MOV can write to a register other than its source.
  // 2. A MOV accepts memory operands.

  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
      N0.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
  EVT CVT = N1.getValueType();

  if (SarConst.isNegative())
    return SDValue();

  for (MVT SVT : MVT::integer_valuetypes()) {
    unsigned ShiftSize = SVT.getSizeInBits();
    // Skip types without a corresponding sext/zext and ShlConst values
    // that are not one of [56,48,32,24,16].
    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
      continue;
    SDLoc DL(N);
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return NN;
    else if (SarConst.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, NN,
                         DAG.getConstant(-SarConst, DL, CVT));
    else
      return DAG.getNode(ISD::SRA, DL, VT, NN,
                         DAG.getConstant(SarConst, DL, CVT));
  }
  return SDValue();
}

/// \brief Returns a vector of 0s if the node in input is a vector logical
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
@@ -24989,6 +25042,10 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
  if (SDValue V = PerformSHLCombine(N, DAG))
    return V;

  if (N->getOpcode() == ISD::SRA)
    if (SDValue V = PerformSRACombine(N, DAG))
      return V;

  // Try to fold this logical shift into a zero vector.
  if (N->getOpcode() != ISD::SRA)
    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
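
For reference, the decision logic of PerformSRACombine above can be modeled as
a small standalone sketch (an editorial illustration in plain C++; the name
classifyFold and the printed labels are hypothetical, not LLVM API):

#include <cstdint>
#include <cstdio>

// Standalone model of the combine's decision logic: given the value width
// and both shift amounts, report which node the combine would build.
void classifyFold(unsigned Size, unsigned ShlConst, unsigned SarConst) {
  const unsigned ShiftSizes[] = {8, 16, 32, 64};
  for (unsigned ShiftSize : ShiftSizes) {
    // Same guard as the loop above: the shl must leave exactly
    // ShiftSize significant low bits.
    if (ShlConst != Size - ShiftSize)
      continue;
    long long Adjusted = (long long)SarConst - (long long)(Size - ShiftSize);
    if (Adjusted == 0)
      std::printf("sext_inreg to i%u only\n", ShiftSize);
    else if (Adjusted < 0)
      std::printf("sext_inreg to i%u, then shl by %lld\n", ShiftSize,
                  -Adjusted);
    else
      std::printf("sext_inreg to i%u, then sra by %lld\n", ShiftSize,
                  Adjusted);
    return;
  }
  std::printf("no fold\n");
}

int main() {
  classifyFold(64, 56, 48); // movsbq + shlq $8  (the PR24373 case)
  classifyFold(32, 16, 17); // movswl + sarl $1  (shl16sar17 test below)
  classifyFold(32, 16, 16); // movswl only
  return 0;
}

The second call corresponds to the shl16sar17 test added below, where the
adjusted shift amount is positive and an sra remains after the sign extension.
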
14 changes: 11 additions & 3 deletions llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
@@ -4,15 +4,23 @@
; a shr (X, -8) that gets subsequently "optimized away" as undef
; PR4254

; After fixing PR24373,
;   shlq $56, %rdi
;   sarq $48, %rdi
; folds into
;   movsbq %dil, %rax
;   shlq $8, %rax
; which is better for x86.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"

define i64 @foo(i64 %b) nounwind readnone {
entry:
; CHECK-LABEL: foo:
-; CHECK: shlq $56, %rdi
-; CHECK: sarq $48, %rdi
-; CHECK: leaq 1(%rdi), %rax
+; CHECK: movsbq %dil, %rax
+; CHECK: shlq $8, %rax
+; CHECK: orq $1, %rax
  %shl = shl i64 %b, 56 ; <i64> [#uses=1]
  %shr = ashr i64 %shl, 48 ; <i64> [#uses=1]
  %add5 = or i64 %shr, 1 ; <i64> [#uses=1]
37 changes: 37 additions & 0 deletions llvm/test/CodeGen/X86/sar_fold.ll
@@ -0,0 +1,37 @@
; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s

define i32 @shl16sar15(i32 %a) #0 {
; CHECK-LABEL: shl16sar15:
; CHECK: # BB#0:
; CHECK-NEXT: movswl {{[0-9]+}}(%esp), %eax
  %1 = shl i32 %a, 16
  %2 = ashr exact i32 %1, 15
  ret i32 %2
}

define i32 @shl16sar17(i32 %a) #0 {
; CHECK-LABEL: shl16sar17:
; CHECK: # BB#0:
; CHECK-NEXT: movswl {{[0-9]+}}(%esp), %eax
  %1 = shl i32 %a, 16
  %2 = ashr exact i32 %1, 17
  ret i32 %2
}

define i32 @shl24sar23(i32 %a) #0 {
; CHECK-LABEL: shl24sar23:
; CHECK: # BB#0:
; CHECK-NEXT: movsbl {{[0-9]+}}(%esp), %eax
  %1 = shl i32 %a, 24
  %2 = ashr exact i32 %1, 23
  ret i32 %2
}

define i32 @shl24sar25(i32 %a) #0 {
; CHECK-LABEL: shl24sar25:
; CHECK: # BB#0:
; CHECK-NEXT: movsbl {{[0-9]+}}(%esp), %eax
  %1 = shl i32 %a, 24
  %2 = ashr exact i32 %1, 25
  ret i32 %2
}
43 changes: 43 additions & 0 deletions llvm/test/CodeGen/X86/sar_fold64.ll
@@ -0,0 +1,43 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s

define i32 @shl48sar47(i64 %a) #0 {
; CHECK-LABEL: shl48sar47:
; CHECK: # BB#0:
; CHECK-NEXT: movswq %di, %rax
  %1 = shl i64 %a, 48
  %2 = ashr exact i64 %1, 47
  %3 = trunc i64 %2 to i32
  ret i32 %3
}

define i32 @shl48sar49(i64 %a) #0 {
; CHECK-LABEL: shl48sar49:
; CHECK: # BB#0:
; CHECK-NEXT: movswq %di, %rax
  %1 = shl i64 %a, 48
  %2 = ashr exact i64 %1, 49
  %3 = trunc i64 %2 to i32
  ret i32 %3
}

define i32 @shl56sar55(i64 %a) #0 {
; CHECK-LABEL: shl56sar55:
; CHECK: # BB#0:
; CHECK-NEXT: movsbq %dil, %rax
  %1 = shl i64 %a, 56
  %2 = ashr exact i64 %1, 55
  %3 = trunc i64 %2 to i32
  ret i32 %3
}

define i32 @shl56sar57(i64 %a) #0 {
; CHECK-LABEL: shl56sar57:
; CHECK: # BB#0:
; CHECK-NEXT: movsbq %dil, %rax
  %1 = shl i64 %a, 56
  %2 = ashr exact i64 %1, 57
  %3 = trunc i64 %2 to i32
  ret i32 %3
}

attributes #0 = { nounwind }
