[InstCombine] Make (binop ({s|u}itofp),({s|u}itofp)) transform more flexible to mismatched signs #84389

Closed

Conversation

goldsteinn
Contributor

@goldsteinn goldsteinn commented Mar 7, 2024

  • [InstCombine] Add more tests for transforming `(binop (uitofp)
  • [InstCombine] Make (binop ({s|u}itofp),({s|u}itofp)) transform more flexible to mismatched signs

@goldsteinn goldsteinn requested a review from nikic as a code owner March 7, 2024 22:20
@goldsteinn goldsteinn changed the title goldsteinn/fp int binop sign agnostic [InstCombine] Make (binop ({s|u}itofp),({s|u}itofp)) transform more flexible to mismatched signs Mar 7, 2024
@llvmbot
Collaborator

llvmbot commented Mar 7, 2024

@llvm/pr-subscribers-llvm-transforms

Author: None (goldsteinn)

Changes
  • [InstCombine] Add more tests for transforming `(binop (uitofp)
  • [InstCombine] Make `(binop ({s|u}itofp)

Full diff: https://github.com/llvm/llvm-project/pull/84389.diff

4 Files Affected:

  • (modified) llvm/lib/Transforms/InstCombine/InstCombineInternal.h (+4)
  • (modified) llvm/lib/Transforms/InstCombine/InstructionCombining.cpp (+52-35)
  • (modified) llvm/test/Transforms/InstCombine/add-sitofp.ll (+5-5)
  • (modified) llvm/test/Transforms/InstCombine/binop-itofp.ll (+31-23)
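For reference, a minimal hand-written sketch of the two fold shapes this transform targets (the function names here are made up for illustration and are not part of the patch); the diff below extends them to handle casts with mismatched signs:

```
; Form 1: (fp_binop ({s|u}itofp x), ({s|u}itofp y)) -> ({s|u}itofp (int_binop x, y))
define half @form1(i8 %x_in, i8 %y_in) {
  %x = and i8 %x_in, 63          ; small, non-negative operands keep the casts and the add exact
  %y = and i8 %y_in, 63
  %xf = uitofp i8 %x to half
  %yf = uitofp i8 %y to half
  %r = fadd half %xf, %yf        ; expected to fold to: uitofp (add nuw nsw i8 %x, %y)
  ret half %r
}

; Form 2: (fp_binop ({s|u}itofp x), FpC) -> ({s|u}itofp (int_binop x, (fpto{s|u}i FpC)))
define float @form2(i32 %x_in) {
  %x = and i32 %x_in, 32767
  %xf = uitofp i32 %x to float
  %r = fadd float %xf, -16383.0  ; expected to fold to: sitofp (add nsw i32 %x, -16383)
  ret float %r
}
```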
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 57148d719d9b61..a8bf40602d145c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -380,6 +380,10 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   Instruction *foldBitcastExtElt(ExtractElementInst &ExtElt);
   Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
   Instruction *foldFBinOpOfIntCasts(BinaryOperator &I);
+  // Should only be called by `foldFBinOpOfIntCasts`.
+  Instruction *foldFBinOpOfIntCastsFromSign(
+      BinaryOperator &BO, bool OpsFromSigned, SmallVector<Value *, 2> IntOps,
+      Constant *Op1FpC, SmallVectorImpl<WithCache<const Value *>> *OpsKnown);
   Instruction *foldBinopOfSextBoolToSelect(BinaryOperator &I);
   Instruction *narrowBinOp(TruncInst &Trunc);
   Instruction *narrowMaskedBinOp(BinaryOperator &And);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index f3a740c1b16116..7d7cd980f7353f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1406,57 +1406,48 @@ Value *InstCombinerImpl::dyn_castNegVal(Value *V) const {
 //        -> ({s|u}itofp (int_binop x, y))
 //    2) (fp_binop ({s|u}itofp x), FpC)
 //        -> ({s|u}itofp (int_binop x, (fpto{s|u}i FpC)))
-Instruction *InstCombinerImpl::foldFBinOpOfIntCasts(BinaryOperator &BO) {
-  Value *IntOps[2] = {nullptr, nullptr};
-  Constant *Op1FpC = nullptr;
-
-  // Check for:
-  //    1) (binop ({s|u}itofp x), ({s|u}itofp y))
-  //    2) (binop ({s|u}itofp x), FpC)
-  if (!match(BO.getOperand(0), m_SIToFP(m_Value(IntOps[0]))) &&
-      !match(BO.getOperand(0), m_UIToFP(m_Value(IntOps[0]))))
-    return nullptr;
-
-  if (!match(BO.getOperand(1), m_Constant(Op1FpC)) &&
-      !match(BO.getOperand(1), m_SIToFP(m_Value(IntOps[1]))) &&
-      !match(BO.getOperand(1), m_UIToFP(m_Value(IntOps[1]))))
-    return nullptr;
+//
+// Assuming the sign of the cast for x/y is `OpsFromSigned`.
+Instruction *InstCombinerImpl::foldFBinOpOfIntCastsFromSign(
+    BinaryOperator &BO, bool OpsFromSigned, SmallVector<Value *, 2> IntOps,
+    Constant *Op1FpC, SmallVectorImpl<WithCache<const Value *>> *OpsKnown) {
 
   Type *FPTy = BO.getType();
   Type *IntTy = IntOps[0]->getType();
 
-  // Do we have signed casts?
-  bool OpsFromSigned = isa<SIToFPInst>(BO.getOperand(0));
-
   unsigned IntSz = IntTy->getScalarSizeInBits();
   // This is the maximum number of inuse bits by the integer where the int -> fp
   // casts are exact.
   unsigned MaxRepresentableBits =
       APFloat::semanticsPrecision(FPTy->getScalarType()->getFltSemantics());
 
-  // Cache KnownBits a bit to potentially save some analysis.
-  WithCache<const Value *> OpsKnown[2] = {IntOps[0], IntOps[1]};
-
   // Preserve known number of leading bits. This can allow us to trivial nsw/nuw
   // checks later on.
   unsigned NumUsedLeadingBits[2] = {IntSz, IntSz};
 
+  // NB: This only comes up is OpsFromSigned is true, so there is no need to
+  // cache is between calls to `foldFBinOpOfIntCastsFromSign`.
   auto IsNonZero = [&](unsigned OpNo) -> bool {
-    if (OpsKnown[OpNo].hasKnownBits() &&
-        OpsKnown[OpNo].getKnownBits(SQ).isNonZero())
+    if ((*OpsKnown)[OpNo].hasKnownBits() &&
+        (*OpsKnown)[OpNo].getKnownBits(SQ).isNonZero())
       return true;
     return isKnownNonZero(IntOps[OpNo], SQ.DL);
   };
 
   auto IsNonNeg = [&](unsigned OpNo) -> bool {
-    if (OpsKnown[OpNo].hasKnownBits() &&
-        OpsKnown[OpNo].getKnownBits(SQ).isNonNegative())
-      return true;
-    return isKnownNonNegative(IntOps[OpNo], SQ);
+    // NB: This matches the impl in ValueTracking, we just try to use cached
+    // knownbits here. If we ever start supporting WithCache for
+    // `isKnownNonNegative`, change this to an explicitly call.
+    return (*OpsKnown)[OpNo].getKnownBits(SQ).isNonNegative();
   };
 
   // Check if we know for certain that ({s|u}itofp op) is exact.
   auto IsValidPromotion = [&](unsigned OpNo) -> bool {
+    // Can we treat this operand as the desired sign?
+    if (OpsFromSigned != isa<SIToFPInst>(BO.getOperand(OpNo)) &&
+        !IsNonNeg(OpNo))
+      return false;
+
     // If fp precision >= bitwidth(op) then its exact.
     // NB: This is slightly conservative for `sitofp`. For signed conversion, we
     // can handle `MaxRepresentableBits == IntSz - 1` as the sign bit will be
@@ -1473,7 +1464,7 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCasts(BinaryOperator &BO) {
       // numLeadingZeros(op).
       else {
         NumUsedLeadingBits[OpNo] =
-            IntSz - OpsKnown[OpNo].getKnownBits(SQ).countMinLeadingZeros();
+            IntSz - (*OpsKnown)[OpNo].getKnownBits(SQ).countMinLeadingZeros();
       }
     }
     // NB: We could also check if op is known to be a power of 2 or zero (which
@@ -1509,13 +1500,6 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCasts(BinaryOperator &BO) {
     return nullptr;
 
   if (Op1FpC == nullptr) {
-    if (OpsFromSigned != isa<SIToFPInst>(BO.getOperand(1))) {
-      // If we have a signed + unsigned, see if we can treat both as signed
-      // (uitofp nneg x) == (sitofp nneg x).
-      if (OpsFromSigned ? !IsNonNeg(1) : !IsNonNeg(0))
-        return nullptr;
-      OpsFromSigned = true;
-    }
     if (!IsValidPromotion(1))
       return nullptr;
   }
@@ -1574,6 +1558,39 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCasts(BinaryOperator &BO) {
   return new UIToFPInst(IntBinOp, FPTy);
 }
 
+// Try to fold:
+//    1) (fp_binop ({s|u}itofp x), ({s|u}itofp y))
+//        -> ({s|u}itofp (int_binop x, y))
+//    2) (fp_binop ({s|u}itofp x), FpC)
+//        -> ({s|u}itofp (int_binop x, (fpto{s|u}i FpC)))
+Instruction *InstCombinerImpl::foldFBinOpOfIntCasts(BinaryOperator &BO) {
+  SmallVector<Value *, 2> IntOps = {nullptr, nullptr};
+  Constant *Op1FpC = nullptr;
+  // Check for:
+  //    1) (binop ({s|u}itofp x), ({s|u}itofp y))
+  //    2) (binop ({s|u}itofp x), FpC)
+  if (!match(BO.getOperand(0), m_SIToFP(m_Value(IntOps[0]))) &&
+      !match(BO.getOperand(0), m_UIToFP(m_Value(IntOps[0]))))
+    return nullptr;
+
+  if (!match(BO.getOperand(1), m_Constant(Op1FpC)) &&
+      !match(BO.getOperand(1), m_SIToFP(m_Value(IntOps[1]))) &&
+      !match(BO.getOperand(1), m_UIToFP(m_Value(IntOps[1]))))
+    return nullptr;
+
+  // Cache KnownBits a bit to potentially save some analysis.
+  SmallVector<WithCache<const Value *>, 2> OpsKnown = {IntOps[0], IntOps[1]};
+
+  // Try treating x/y as coming from both `uitofp` and `sitofp`. There are
+  // different constraints depending on the sign of the cast.
+  // NB: `(uitofp nneg X)` == `(sitofp nneg X)`.
+  if (Instruction *R = foldFBinOpOfIntCastsFromSign(BO, /*OpsFromSigned=*/false,
+                                                    IntOps, Op1FpC, &OpsKnown))
+    return R;
+  return foldFBinOpOfIntCastsFromSign(BO, /*OpsFromSigned=*/true, IntOps,
+                                      Op1FpC, &OpsKnown);
+}
+
 /// A binop with a constant operand and a sign-extended boolean operand may be
 /// converted into a select of constants by applying the binary operation to
 /// the constant with the two possible values of the extended boolean (0 or -1).
diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll
index 049db8c84a522d..2bdc808d9771c4 100644
--- a/llvm/test/Transforms/InstCombine/add-sitofp.ll
+++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll
@@ -6,7 +6,7 @@ define double @x(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[A:%.*]], 24
 ; CHECK-NEXT:    [[N:%.*]] = and i32 [[M]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[N]], 1
-; CHECK-NEXT:    [[P:%.*]] = sitofp i32 [[TMP1]] to double
+; CHECK-NEXT:    [[P:%.*]] = uitofp i32 [[TMP1]] to double
 ; CHECK-NEXT:    ret double [[P]]
 ;
   %m = lshr i32 %a, 24
@@ -20,7 +20,7 @@ define double @test(i32 %a) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[A_AND]], 1
-; CHECK-NEXT:    [[RES:%.*]] = sitofp i32 [[TMP1]] to double
+; CHECK-NEXT:    [[RES:%.*]] = uitofp i32 [[TMP1]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   ; Drop two highest bits to guarantee that %a + 1 doesn't overflow
@@ -49,7 +49,7 @@ define double @test_2(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823
 ; CHECK-NEXT:    [[B_AND:%.*]] = and i32 [[B:%.*]], 1073741823
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[A_AND]], [[B_AND]]
-; CHECK-NEXT:    [[RES:%.*]] = sitofp i32 [[TMP1]] to double
+; CHECK-NEXT:    [[RES:%.*]] = uitofp i32 [[TMP1]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   ; Drop two highest bits to guarantee that %a + %b doesn't overflow
@@ -89,7 +89,7 @@ define float @test_3(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[A:%.*]], 24
 ; CHECK-NEXT:    [[N:%.*]] = and i32 [[M]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[N]], 1
-; CHECK-NEXT:    [[P:%.*]] = sitofp i32 [[TMP1]] to float
+; CHECK-NEXT:    [[P:%.*]] = uitofp i32 [[TMP1]] to float
 ; CHECK-NEXT:    ret float [[P]]
 ;
   %m = lshr i32 %a, 24
@@ -104,7 +104,7 @@ define <4 x double> @test_4(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], <i32 1073741823, i32 1073741823, i32 1073741823, i32 1073741823>
 ; CHECK-NEXT:    [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], <i32 1073741823, i32 1073741823, i32 1073741823, i32 1073741823>
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw <4 x i32> [[A_AND]], [[B_AND]]
-; CHECK-NEXT:    [[RES:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double>
+; CHECK-NEXT:    [[RES:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
 ; CHECK-NEXT:    ret <4 x double> [[RES]]
 ;
   ; Drop two highest bits to guarantee that %a + %b doesn't overflow
diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll
index ffa89374579145..7d2b872985d5c2 100644
--- a/llvm/test/Transforms/InstCombine/binop-itofp.ll
+++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll
@@ -110,7 +110,7 @@ define half @test_ui_si_i8_add(i8 noundef %x_in, i8 noundef %y_in) {
 ; CHECK-NEXT:    [[X:%.*]] = and i8 [[X_IN:%.*]], 63
 ; CHECK-NEXT:    [[Y:%.*]] = and i8 [[Y_IN:%.*]], 63
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i8 [[X]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = sitofp i8 [[TMP1]] to half
+; CHECK-NEXT:    [[R:%.*]] = uitofp i8 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = and i8 %x_in, 63
@@ -125,9 +125,8 @@ define half @test_ui_si_i8_add_overflow(i8 noundef %x_in, i8 noundef %y_in) {
 ; CHECK-LABEL: @test_ui_si_i8_add_overflow(
 ; CHECK-NEXT:    [[X:%.*]] = and i8 [[X_IN:%.*]], 63
 ; CHECK-NEXT:    [[Y:%.*]] = and i8 [[Y_IN:%.*]], 65
-; CHECK-NEXT:    [[XF:%.*]] = sitofp i8 [[X]] to half
-; CHECK-NEXT:    [[YF:%.*]] = uitofp i8 [[Y]] to half
-; CHECK-NEXT:    [[R:%.*]] = fadd half [[XF]], [[YF]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = uitofp i8 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = and i8 %x_in, 63
@@ -152,9 +151,8 @@ define half @test_ui_ui_i8_sub_C(i8 noundef %x_in) {
 
 define half @test_ui_ui_i8_sub_C_fail_overflow(i8 noundef %x_in) {
 ; CHECK-LABEL: @test_ui_ui_i8_sub_C_fail_overflow(
-; CHECK-NEXT:    [[X:%.*]] = and i8 [[X_IN:%.*]], 127
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i8 [[X]] to half
-; CHECK-NEXT:    [[R:%.*]] = fadd half [[XF]], 0xHD800
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X_IN:%.*]], -128
+; CHECK-NEXT:    [[R:%.*]] = sitofp i8 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = and i8 %x_in, 127
@@ -212,8 +210,8 @@ define half @test_si_si_i8_sub_C(i8 noundef %x_in) {
 define half @test_si_si_i8_sub_C_fail_overflow(i8 noundef %x_in) {
 ; CHECK-LABEL: @test_si_si_i8_sub_C_fail_overflow(
 ; CHECK-NEXT:    [[X:%.*]] = and i8 [[X_IN:%.*]], 65
-; CHECK-NEXT:    [[XF:%.*]] = sitofp i8 [[X]] to half
-; CHECK-NEXT:    [[R:%.*]] = fadd half [[XF]], 0xH5400
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i8 [[X]], 64
+; CHECK-NEXT:    [[R:%.*]] = uitofp i8 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = and i8 %x_in, 65
@@ -242,9 +240,8 @@ define half @test_ui_si_i8_sub_fail_maybe_sign(i8 noundef %x_in, i8 noundef %y_i
 ; CHECK-LABEL: @test_ui_si_i8_sub_fail_maybe_sign(
 ; CHECK-NEXT:    [[X:%.*]] = or i8 [[X_IN:%.*]], 64
 ; CHECK-NEXT:    [[Y:%.*]] = and i8 [[Y_IN:%.*]], 63
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i8 [[X]] to half
-; CHECK-NEXT:    [[YF:%.*]] = sitofp i8 [[Y]] to half
-; CHECK-NEXT:    [[R:%.*]] = fsub half [[XF]], [[YF]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw nsw i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = uitofp i8 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = or i8 %x_in, 64
@@ -273,8 +270,8 @@ define half @test_ui_ui_i8_mul(i8 noundef %x_in, i8 noundef %y_in) {
 
 define half @test_ui_ui_i8_mul_C(i8 noundef %x_in) {
 ; CHECK-LABEL: @test_ui_ui_i8_mul_C(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i8 [[X_IN:%.*]], 4
-; CHECK-NEXT:    [[R:%.*]] = uitofp i8 [[TMP1]] to half
+; CHECK-NEXT:    [[X:%.*]] = shl i8 [[X_IN:%.*]], 4
+; CHECK-NEXT:    [[R:%.*]] = uitofp i8 [[X]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = and i8 %x_in, 15
@@ -368,7 +365,7 @@ define half @test_ui_si_i8_mul(i8 noundef %x_in, i8 noundef %y_in) {
 ; CHECK-NEXT:    [[YY:%.*]] = and i8 [[Y_IN:%.*]], 7
 ; CHECK-NEXT:    [[Y:%.*]] = add nuw nsw i8 [[YY]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i8 [[X]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = sitofp i8 [[TMP1]] to half
+; CHECK-NEXT:    [[R:%.*]] = uitofp i8 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %xx = and i8 %x_in, 6
@@ -386,9 +383,8 @@ define half @test_ui_si_i8_mul_fail_maybe_zero(i8 noundef %x_in, i8 noundef %y_i
 ; CHECK-NEXT:    [[XX:%.*]] = and i8 [[X_IN:%.*]], 7
 ; CHECK-NEXT:    [[X:%.*]] = add nuw nsw i8 [[XX]], 1
 ; CHECK-NEXT:    [[Y:%.*]] = and i8 [[Y_IN:%.*]], 7
-; CHECK-NEXT:    [[XF:%.*]] = sitofp i8 [[X]] to half
-; CHECK-NEXT:    [[YF:%.*]] = uitofp i8 [[Y]] to half
-; CHECK-NEXT:    [[R:%.*]] = fmul half [[XF]], [[YF]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = uitofp i8 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %xx = and i8 %x_in, 7
@@ -694,7 +690,7 @@ define half @test_ui_si_i16_mul(i16 noundef %x_in, i16 noundef %y_in) {
 ; CHECK-NEXT:    [[YY:%.*]] = and i16 [[Y_IN:%.*]], 126
 ; CHECK-NEXT:    [[Y:%.*]] = or disjoint i16 [[YY]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = sitofp i16 [[TMP1]] to half
+; CHECK-NEXT:    [[R:%.*]] = uitofp i16 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %xx = and i16 %x_in, 126
@@ -807,9 +803,8 @@ define half @test_ui_ui_i12_sub_fail_overflow(i12 noundef %x_in, i12 noundef %y_
 ; CHECK-LABEL: @test_ui_ui_i12_sub_fail_overflow(
 ; CHECK-NEXT:    [[X:%.*]] = and i12 [[X_IN:%.*]], 1023
 ; CHECK-NEXT:    [[Y:%.*]] = and i12 [[Y_IN:%.*]], 2047
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i12 [[X]] to half
-; CHECK-NEXT:    [[YF:%.*]] = uitofp i12 [[Y]] to half
-; CHECK-NEXT:    [[R:%.*]] = fsub half [[XF]], [[YF]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw i12 [[X]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = sitofp i12 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = and i12 %x_in, 1023
@@ -984,7 +979,7 @@ define half @test_ui_si_i12_mul_nsw(i12 noundef %x_in, i12 noundef %y_in) {
 ; CHECK-NEXT:    [[YY:%.*]] = and i12 [[Y_IN:%.*]], 30
 ; CHECK-NEXT:    [[Y:%.*]] = or disjoint i12 [[YY]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i12 [[X]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = sitofp i12 [[TMP1]] to half
+; CHECK-NEXT:    [[R:%.*]] = uitofp i12 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %xx = and i12 %x_in, 31
@@ -996,3 +991,16 @@ define half @test_ui_si_i12_mul_nsw(i12 noundef %x_in, i12 noundef %y_in) {
   %r = fmul half %xf, %yf
   ret half %r
 }
+
+define float @test_ui_add_with_signed_constant(i32 %shr.i) {
+; CHECK-LABEL: @test_ui_add_with_signed_constant(
+; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I:%.*]], 32767
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[AND_I]], -16383
+; CHECK-NEXT:    [[ADD:%.*]] = sitofp i32 [[TMP1]] to float
+; CHECK-NEXT:    ret float [[ADD]]
+;
+  %and.i = and i32 %shr.i, 32767
+  %sub = uitofp i32 %and.i to float
+  %add = fadd float %sub, -16383.0
+  ret float %add
+}

@dtcxzyw
Member

dtcxzyw commented Mar 8, 2024

Could you please have a look at dtcxzyw/llvm-opt-benchmark#336 (comment)?

@goldsteinn
Contributor Author

Could you please have a look at dtcxzyw/llvm-opt-benchmark#336 (comment)?

So it seems this transform is enabling SLP vectorization in a case where it's not profitable:

;; Before
; *** IR Dump After SimplifyCFGPass on regress ***
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
define float @regress(i16 %inp) local_unnamed_addr #0 {
  %r0 = urem i16 %inp, 20
  %ui0 = uitofp i16 %r0 to float
  %fadd0 = fadd float %ui0, -1.000000e+01
  %fdiv0 = fdiv float %fadd0, 0.000000e+00
  %ui1 = uitofp i16 %inp to float
  %fdiv1 = fdiv float %ui1, 0.000000e+00
  %r = fmul float %fdiv1, %fdiv0
  ret float %r
}
; *** IR Dump After SLPVectorizerPass on regress ***
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
define float @regress(i16 %inp) local_unnamed_addr #0 {
  %r0 = urem i16 %inp, 20
  %ui0 = uitofp i16 %r0 to float
  %fadd0 = fadd float %ui0, -1.000000e+01
  %fdiv0 = fdiv float %fadd0, 0.000000e+00
  %ui1 = uitofp i16 %inp to float
  %fdiv1 = fdiv float %ui1, 0.000000e+00
  %r = fmul float %fdiv1, %fdiv0
  ret float %r
}
;; After
; *** IR Dump After SimplifyCFGPass on regress ***
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
define float @regress(i16 %inp) local_unnamed_addr #0 {
  %r0 = urem i16 %inp, 20
  %1 = add nsw i16 %r0, -10
  %fadd0 = sitofp i16 %1 to float
  %fdiv0 = fdiv float %fadd0, 0.000000e+00
  %ui1 = uitofp i16 %inp to float
  %fdiv1 = fdiv float %ui1, 0.000000e+00
  %r = fmul float %fdiv1, %fdiv0
  ret float %r
}
; *** IR Dump After SLPVectorizerPass on regress ***
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
define float @regress(i16 %inp) local_unnamed_addr #0 {
  %r0 = urem i16 %inp, 20
  %1 = add nsw i16 %r0, -10
  %2 = insertelement <2 x i16> poison, i16 %inp, i32 0
  %3 = insertelement <2 x i16> %2, i16 %1, i32 1
  %4 = uitofp <2 x i16> %3 to <2 x float>
  %5 = sitofp <2 x i16> %3 to <2 x float>
  %6 = shufflevector <2 x float> %4, <2 x float> %5, <2 x i32> <i32 0, i32 3>
  %7 = fdiv <2 x float> %6, zeroinitializer
  %8 = extractelement <2 x float> %7, i32 0
  %9 = extractelement <2 x float> %7, i32 1
  %r = fmul float %8, %9
  ret float %r
}

Looking into a fix.

@goldsteinn
Contributor Author

Actually, I think it's profitable because it saves a division (now vectorized). For example, on x86:

0000000000000000 <before>:
       0: 0f b7 c7                     	movzwl	%di, %eax
       3: 69 c8 cd cc 00 00            	imull	$0xcccd, %eax, %ecx     # imm = 0xCCCD
       9: c1 e9 12                     	shrl	$0x12, %ecx
       c: 83 e1 fc                     	andl	$-0x4, %ecx
       f: 8d 0c 89                     	leal	(%rcx,%rcx,4), %ecx
      12: 29 cf                        	subl	%ecx, %edi
      14: 0f bf cf                     	movswl	%di, %ecx
      17: c5 fa 2a c1                  	vcvtsi2ss	%ecx, %xmm0, %xmm0
      1b: c5 fa 58 05 00 00 00 00      	vaddss	(%rip), %xmm0, %xmm0    # 0x23 <before+0x23>
      23: c5 f0 57 c9                  	vxorps	%xmm1, %xmm1, %xmm1
      27: c5 fa 5e c1                  	vdivss	%xmm1, %xmm0, %xmm0
      2b: c5 ea 2a d0                  	vcvtsi2ss	%eax, %xmm2, %xmm2
      2f: c5 ea 5e c9                  	vdivss	%xmm1, %xmm2, %xmm1
      33: c5 f2 59 c0                  	vmulss	%xmm0, %xmm1, %xmm0
      37: c3                           	retq
      38: 0f 1f 84 00 00 00 00 00      	nopl	(%rax,%rax)

0000000000000040 <after>:
      40: 0f b7 c7                     	movzwl	%di, %eax
      43: 69 c0 cd cc 00 00            	imull	$0xcccd, %eax, %eax     # imm = 0xCCCD
      49: c1 e8 12                     	shrl	$0x12, %eax
      4c: 83 e0 fc                     	andl	$-0x4, %eax
      4f: 8d 04 80                     	leal	(%rax,%rax,4), %eax
      52: f7 d8                        	negl	%eax
      54: 8d 44 07 f6                  	leal	-0xa(%rdi,%rax), %eax
      58: c5 f9 6e c7                  	vmovd	%edi, %xmm0
      5c: c5 f9 c4 c0 01               	vpinsrw	$0x1, %eax, %xmm0, %xmm0
      61: c4 e2 79 33 c8               	vpmovzxwd	%xmm0, %xmm1    # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
      66: c5 f8 5b c9                  	vcvtdq2ps	%xmm1, %xmm1
      6a: c4 e2 79 23 c0               	vpmovsxwd	%xmm0, %xmm0
      6f: c5 f8 5b c0                  	vcvtdq2ps	%xmm0, %xmm0
      73: c4 e3 71 0c c0 02            	vblendps	$0x2, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
      79: c5 f0 57 c9                  	vxorps	%xmm1, %xmm1, %xmm1
      7d: c5 f8 5e c1                  	vdivps	%xmm1, %xmm0, %xmm0
      81: c5 fa 16 c8                  	vmovshdup	%xmm0, %xmm1    # xmm1 = xmm0[1,1,3,3]
      85: c5 fa 59 c1                  	vmulss	%xmm1, %xmm0, %xmm0
      89: c3                           	retq

The LLVM MCA summaries are:

BEFORE:
Iterations:        100
Instructions:      1500
Total Cycles:      2418
Total uOps:        2000

AFTER:
Iterations:        100
Instructions:      1900
Total Cycles:      590
Total uOps:        2200

@goldsteinn
Contributor Author

It seems that without the transform, the fadd prevents SLP vectorization, i.e.:

BEFORE:
Operand 0:
    %ui0 = uitofp i16 %r0 to float
    %ui1 = uitofp i16 %inp to float
Operand 1:
  float -1.000000e+01
  float 0.000000e+00
Scalars: 
    %fadd0 = fadd float %ui0, -1.000000e+01
    %fdiv1 = fdiv float %ui1, 0.000000e+00
State: Vectorize
MainOp:   %fadd0 = fadd float %ui0, -1.000000e+01
AltOp:   %fdiv1 = fdiv float %ui1, 0.000000e+00
VectorizedValue: NULL
ReuseShuffleIndices: Empty
ReorderIndices: 
UserTreeIndices: 
SLP: Costs:
SLP:     ReuseShuffleCost = 0
SLP:     VectorCost = 6
SLP:     ScalarCost = 5
SLP:     ReuseShuffleCost + VecCost - ScalarCost = 1
SLP: Adding cost 1 for bundle n=2 [  %fadd0 = fadd float %ui0, -1.000000e+01, ..].
SLP: Current total cost = 1
SLP: Calculated costs for Tree:

AFTER:
0.
Operand 0:
    %fadd0 = sitofp i16 %1 to float
    %ui1 = uitofp i16 %inp to float
Operand 1:
  float 0.000000e+00
  float 0.000000e+00
Scalars: 
    %fdiv0 = fdiv float %fadd0, 0.000000e+00
    %fdiv1 = fdiv float %ui1, 0.000000e+00
State: Vectorize
MainOp:   %fdiv0 = fdiv float %fadd0, 0.000000e+00
AltOp:   %fdiv0 = fdiv float %fadd0, 0.000000e+00
VectorizedValue: NULL
ReuseShuffleIndices: Empty
ReorderIndices: 
UserTreeIndices: 
SLP: Costs:
SLP:     ReuseShuffleCost = 0
SLP:     VectorCost = 4
SLP:     ScalarCost = 8
SLP:     ReuseShuffleCost + VecCost - ScalarCost = -4
SLP: Adding cost -4 for bundle n=2 [  %fdiv0 = fdiv float %fadd0, 0.000000e+00, ..].
SLP: Current total cost = -4
SLP: Calculated costs for Tree:

Member

@dtcxzyw dtcxzyw left a comment

LGTM w/ a nit. Please wait for additional approval from other reviewers.

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp (outdated review thread, resolved)
… flexible to mismatched signs

Instead of taking the sign of the cast operation as the required sign
for the transform, only force a sign if an operand may be
negative.

This gives us more flexibility when checking whether the floats are safely
convertible to integers.
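
A minimal sketch of what this relaxation buys, loosely mirroring the updated test_ui_si_i8_add in binop-itofp.ll: the two casts disagree on signedness, but both operands are provably non-negative after the masks, so `(uitofp nneg X)` == `(sitofp nneg X)` and the fold can still fire:

```
define half @mixed_sign_add(i8 %x_in, i8 %y_in) {
  %x = and i8 %x_in, 63        ; known non-negative, fits exactly in half
  %y = and i8 %y_in, 63
  %xf = uitofp i8 %x to half
  %yf = sitofp i8 %y to half   ; sign of this cast differs from %xf
  %r = fadd half %xf, %yf
  ret half %r
}
; Expected after this patch, per the updated CHECK lines (modulo value names):
;   %add = add nuw nsw i8 %x, %y
;   %r   = uitofp i8 %add to half
```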
@goldsteinn goldsteinn force-pushed the goldsteinn/fp-int-binop-sign-agnostic branch from e555b79 to 72794f0 Compare March 8, 2024 23:13
// Preserve known number of leading bits. This can allow us to trivial nsw/nuw
// checks later on.
unsigned NumUsedLeadingBits[2] = {IntSz, IntSz};

// NB: This only comes up is OpsFromSigned is true, so there is no need to
Contributor

Suggested change
// NB: This only comes up is OpsFromSigned is true, so there is no need to
// NB: This only comes up if OpsFromSigned is true, so there is no need to

// Preserve known number of leading bits. This can allow us to trivial nsw/nuw
// checks later on.
unsigned NumUsedLeadingBits[2] = {IntSz, IntSz};

// NB: This only comes up is OpsFromSigned is true, so there is no need to
// cache is between calls to `foldFBinOpOfIntCastsFromSign`.
Contributor

Suggested change
// cache is between calls to `foldFBinOpOfIntCastsFromSign`.
// cache it between calls to `foldFBinOpOfIntCastsFromSign`.

return isKnownNonNegative(IntOps[OpNo], SQ);
// NB: This matches the impl in ValueTracking, we just try to use cached
// knownbits here. If we ever start supporting WithCache for
// `isKnownNonNegative`, change this to an explicitly call.
Contributor

Suggested change
// `isKnownNonNegative`, change this to an explicitly call.
// `isKnownNonNegative`, change this to an explicit call.

@goldsteinn goldsteinn closed this in 8d976c7 Mar 9, 2024
goldsteinn added a commit to goldsteinn/llvm-project that referenced this pull request Mar 9, 2024
…element is demanded

This came as a result of PR llvm#84389. The SLP vectorizer can vectorize a
pattern like:
```
    (blend
        (vec_ops0... (insert ?,X,0)),
        (vec_ops1... (insert ?,Y,1))
    )
```

In this case, `vec_ops0...` and `vec_ops1...` are essentially doing
scalar transforms.

Previously, we handled things like:
`(blend (insert ?,X,0), (insert ?,Y,0))`

This patch extends that to look through `vec_ops...` that can be
scalarized, and if it is possible to scalarize all the ops, it transforms
the input to:
```
    (blend
        (insert ?,(scalar_ops0... X), 0),
        (insert ?,(scalar_ops1... Y), 0)
    )
```
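
For illustration, a hand-written sketch (function and value names are hypothetical) of the blend-of-single-lane-chains shape described above, modeled on the SLP output for `@regress` earlier in this thread:

```
define <2 x float> @blend_of_scalar_chains(i16 %x, i16 %y) {
  %ins.x = insertelement <2 x i16> poison, i16 %x, i32 0
  %ins.y = insertelement <2 x i16> poison, i16 %y, i32 1
  %ops0 = uitofp <2 x i16> %ins.x to <2 x float>   ; vec_ops0... (only lane 0 is demanded)
  %ops1 = sitofp <2 x i16> %ins.y to <2 x float>   ; vec_ops1... (only lane 1 is demanded)
  %blend = shufflevector <2 x float> %ops0, <2 x float> %ops1, <2 x i32> <i32 0, i32 3>
  ret <2 x float> %blend
}
; The patch described above would scalarize the casts: compute uitofp/sitofp on
; %x and %y directly and insert the scalar results into the blended vector.
```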
goldsteinn added a commit to goldsteinn/llvm-project that referenced this pull request Mar 9, 2024
…element is demanded