From e71870512fd896bf6cf34e8ae650f4cf20923258 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Sat, 7 Aug 2021 15:20:35 +0300
Subject: [PATCH] [InstCombine] Prefer `-(x & 1)` as the low bit splatting pattern (PR51305)

Both patterns are equivalent (https://alive2.llvm.org/ce/z/jfCViF),
so we should have a preference. It seems like mask+negation is better
than two shifts.
---
 .../InstCombine/InstCombineShifts.cpp       | 17 ++++++++++-
 llvm/test/Transforms/InstCombine/exact.ll   |  8 ++---
 .../Transforms/InstCombine/low-bit-splat.ll | 29 ++++++++++---------
 llvm/test/Transforms/InstCombine/sext.ll    |  6 ++--
 4 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index ca5e473fdecba..01f8e60db7e78 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1346,6 +1346,22 @@ Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) {
     }
   }
 
+  // Prefer `-(x & 1)` over `(x << (bitwidth(x)-1)) a>> (bitwidth(x)-1)`
+  // as the pattern to splat the lowest bit.
+  // FIXME: iff X is already masked, we don't need the one-use check.
+  Value *X;
+  if (match(Op1, m_SpecificIntAllowUndef(BitWidth - 1)) &&
+      match(Op0, m_OneUse(m_Shl(m_Value(X),
+                                m_SpecificIntAllowUndef(BitWidth - 1))))) {
+    Constant *Mask = ConstantInt::get(Ty, 1);
+    // Retain the knowledge about the ignored lanes.
+    Mask = Constant::mergeUndefsWith(
+        Constant::mergeUndefsWith(Mask, cast<Constant>(Op1)),
+        cast<Constant>(cast<Instruction>(Op0)->getOperand(1)));
+    X = Builder.CreateAnd(X, Mask);
+    return BinaryOperator::CreateNeg(X);
+  }
+
   if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I))
     return R;
 
@@ -1354,7 +1370,6 @@ Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) {
     return BinaryOperator::CreateLShr(Op0, Op1);
 
   // ashr (xor %x, -1), %y --> xor (ashr %x, %y), -1
-  Value *X;
   if (match(Op0, m_OneUse(m_Not(m_Value(X))))) {
     // Note that we must drop 'exact'-ness of the shift!
     // Note that we can't keep undef's in -1 vector constant!
diff --git a/llvm/test/Transforms/InstCombine/exact.ll b/llvm/test/Transforms/InstCombine/exact.ll
index 6b52dfb0380ea..e87e12cc1bea1 100644
--- a/llvm/test/Transforms/InstCombine/exact.ll
+++ b/llvm/test/Transforms/InstCombine/exact.ll
@@ -147,8 +147,8 @@ define <2 x i1> @ashr_icmp2_vec(<2 x i64> %X) {
 ; Make sure we don't transform the ashr here into an sdiv
 define i1 @pr9998(i32 %V) {
 ; CHECK-LABEL: @pr9998(
-; CHECK-NEXT:    [[W_MASK:%.*]] = and i32 [[V:%.*]], 1
-; CHECK-NEXT:    [[Z:%.*]] = icmp ne i32 [[W_MASK]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[Z:%.*]] = icmp ne i32 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[Z]]
 ;
   %W = shl i32 %V, 31
@@ -161,8 +161,8 @@ define i1 @pr9998(i32 %V) {
 ; FIXME: Vectors should fold the same way.
 define <2 x i1> @pr9998vec(<2 x i32> %V) {
 ; CHECK-LABEL: @pr9998vec(
-; CHECK-NEXT:    [[W:%.*]] = shl <2 x i32> [[V:%.*]], <i32 31, i32 31>
-; CHECK-NEXT:    [[X:%.*]] = ashr exact <2 x i32> [[W]], <i32 31, i32 31>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[V:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    [[X:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT:    [[Y:%.*]] = sext <2 x i32> [[X]] to <2 x i64>
 ; CHECK-NEXT:    [[Z:%.*]] = icmp ugt <2 x i64> [[Y]],
 ; CHECK-NEXT:    ret <2 x i1> [[Z]]
diff --git a/llvm/test/Transforms/InstCombine/low-bit-splat.ll b/llvm/test/Transforms/InstCombine/low-bit-splat.ll
index ce891f7d03f1e..332cb32ad1de6 100644
--- a/llvm/test/Transforms/InstCombine/low-bit-splat.ll
+++ b/llvm/test/Transforms/InstCombine/low-bit-splat.ll
@@ -9,8 +9,8 @@ declare void @use8(i8)
 ; Basic positive scalar tests
 define i8 @t0(i8 %x) {
 ; CHECK-LABEL: @t0(
-; CHECK-NEXT:    [[I0:%.*]] = shl i8 [[X:%.*]], 7
-; CHECK-NEXT:    [[R:%.*]] = ashr exact i8 [[I0]], 7
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X:%.*]], 1
+; CHECK-NEXT:    [[R:%.*]] = sub nsw i8 0, [[TMP1]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %i0 = shl i8 %x, 7
@@ -19,8 +19,8 @@ define i8 @t0(i8 %x) {
 }
 define i16 @t1_otherbitwidth(i16 %x) {
 ; CHECK-LABEL: @t1_otherbitwidth(
-; CHECK-NEXT:    [[I0:%.*]] = shl i16 [[X:%.*]], 15
-; CHECK-NEXT:    [[R:%.*]] = ashr exact i16 [[I0]], 15
+; CHECK-NEXT:    [[TMP1:%.*]] = and i16 [[X:%.*]], 1
+; CHECK-NEXT:    [[R:%.*]] = sub nsw i16 0, [[TMP1]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %i0 = shl i16 %x, 15
@@ -31,8 +31,8 @@ define i16 @t1_otherbitwidth(i16 %x) {
 ; Basic positive vector tests
 define <2 x i8> @t2_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @t2_vec(
-; CHECK-NEXT:    [[I0:%.*]] = shl <2 x i8> [[X:%.*]], <i8 7, i8 7>
-; CHECK-NEXT:    [[R:%.*]] = ashr exact <2 x i8> [[I0]], <i8 7, i8 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i8> [[X:%.*]], <i8 1, i8 1>
+; CHECK-NEXT:    [[R:%.*]] = sub nsw <2 x i8> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %i0 = shl <2 x i8> %x, <i8 7, i8 7>
@@ -41,8 +41,8 @@ define <2 x i8> @t2_vec(<2 x i8> %x) {
 }
 define <3 x i8> @t3_vec_undef0(<3 x i8> %x) {
 ; CHECK-LABEL: @t3_vec_undef0(
-; CHECK-NEXT:    [[I0:%.*]] = shl <3 x i8> [[X:%.*]],
-; CHECK-NEXT:    [[R:%.*]] = ashr <3 x i8> [[I0]],
+; CHECK-NEXT:    [[TMP1:%.*]] = and <3 x i8> [[X:%.*]],
+; CHECK-NEXT:    [[R:%.*]] = sub <3 x i8> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %i0 = shl <3 x i8> %x,
@@ -51,8 +51,8 @@ define <3 x i8> @t3_vec_undef0(<3 x i8> %x) {
 }
 define <3 x i8> @t4_vec_undef1(<3 x i8> %x) {
 ; CHECK-LABEL: @t4_vec_undef1(
-; CHECK-NEXT:    [[I0:%.*]] = shl <3 x i8> [[X:%.*]],
-; CHECK-NEXT:    [[R:%.*]] = ashr <3 x i8> [[I0]],
+; CHECK-NEXT:    [[TMP1:%.*]] = and <3 x i8> [[X:%.*]],
+; CHECK-NEXT:    [[R:%.*]] = sub <3 x i8> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %i0 = shl <3 x i8> %x,
@@ -61,8 +61,8 @@ define <3 x i8> @t4_vec_undef1(<3 x i8> %x) {
 }
 define <3 x i8> @t5_vec_undef2(<3 x i8> %x) {
 ; CHECK-LABEL: @t5_vec_undef2(
-; CHECK-NEXT:    [[I0:%.*]] = shl <3 x i8> [[X:%.*]],
-; CHECK-NEXT:    [[R:%.*]] = ashr <3 x i8> [[I0]],
+; CHECK-NEXT:    [[TMP1:%.*]] = and <3 x i8> [[X:%.*]],
+; CHECK-NEXT:    [[R:%.*]] = sub <3 x i8> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %i0 = shl <3 x i8> %x,
@@ -89,8 +89,8 @@ define i8 @t7_already_masked(i8 %x) {
 ; CHECK-LABEL: @t7_already_masked(
 ; CHECK-NEXT:    [[I0:%.*]] = and i8 [[X:%.*]], 1
 ; CHECK-NEXT:    call void @use8(i8 [[I0]])
-; CHECK-NEXT:    [[I1:%.*]] = shl i8 [[X]], 7
-; CHECK-NEXT:    [[R:%.*]] = ashr exact i8 [[I1]], 7
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X]], 1
+; CHECK-NEXT:    [[R:%.*]] = sub nsw i8 0, [[TMP1]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %i0 = and i8 %x, 1
@@ -99,6 +99,7 @@ define i8 @t7_already_masked(i8 %x) {
   %r = ashr i8 %i1, 7
   ret i8 %r
 }
+; FIXME: we should fold this
 define i8 @t8_already_masked_extrause(i8 %x) {
 ; CHECK-LABEL: @t8_already_masked_extrause(
 ; CHECK-NEXT:    [[I0:%.*]] = and i8 [[X:%.*]], 1
diff --git a/llvm/test/Transforms/InstCombine/sext.ll b/llvm/test/Transforms/InstCombine/sext.ll
index 04573e2e8ddca..1fc645cc9289a 100644
--- a/llvm/test/Transforms/InstCombine/sext.ll
+++ b/llvm/test/Transforms/InstCombine/sext.ll
@@ -306,8 +306,10 @@ define i32 @test18(i16 %x) {
 
 define i10 @test19(i10 %i) {
 ; CHECK-LABEL: @test19(
-; CHECK-NEXT:    [[D1:%.*]] = shl i10 [[I:%.*]], 9
-; CHECK-NEXT:    [[D:%.*]] = ashr exact i10 [[D1]], 9
+; CHECK-NEXT:    [[A:%.*]] = trunc i10 [[I:%.*]] to i3
+; CHECK-NEXT:    [[TMP1:%.*]] = and i3 [[A]], 1
+; CHECK-NEXT:    [[C:%.*]] = sub nsw i3 0, [[TMP1]]
+; CHECK-NEXT:    [[D:%.*]] = sext i3 [[C]] to i10
 ; CHECK-NEXT:    ret i10 [[D]]
 ;
   %a = trunc i10 %i to i3
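
For reference, a minimal standalone IR sketch of the rewrite this patch teaches visitAShr; the @src/@tgt function names are illustrative only (they are not taken from the patch), and the pair can be checked with the alive2 link in the commit message or by running @src through `opt -passes=instcombine`:

; @src: old canonical form, splat the low bit of %x via shl/ashr by bitwidth-1.
define i8 @src(i8 %x) {
  %s = shl i8 %x, 7
  %r = ashr i8 %s, 7
  ret i8 %r
}

; @tgt: new preferred form, mask out the low bit and negate it.
define i8 @tgt(i8 %x) {
  %m = and i8 %x, 1
  %r = sub i8 0, %m
  ret i8 %r
}

Both functions return 0 when the low bit of %x is clear and -1 (all bits set) when it is set, which is why the two patterns are interchangeable and InstCombine can simply prefer the cheaper mask+negate form.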