[InstCombine] Canonicalize (sitofp x) -> (uitofp x) if x >= 0 #82404

Closed (wanted to merge 1 commit)

Conversation

@goldsteinn (Contributor) commented Feb 20, 2024

Just a standard canonicalization.

Proofs: https://alive2.llvm.org/ce/z/9W4VFm
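
For illustration, a minimal before/after sketch of the fold in its originally posted direction, in the spirit of the linked proofs (the mask clears the sign bit, so both casts produce the same value; the patch was later flipped to prefer uitofp):

define float @src(i32 %x) {
  %m = and i32 %x, 32767      ; high bit known clear, so %m >= 0
  %f = uitofp i32 %m to float
  ret float %f
}

define float @tgt(i32 %x) {
  %m = and i32 %x, 32767
  %f = sitofp i32 %m to float ; the canonical form this revision picks
  ret float %f
}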

@goldsteinn changed the title from "goldsteinn/ui to si" to "[InstCombine] Canonicalize (uitofp x) -> (sitofp x) if x >= 0" on Feb 20, 2024
@llvmbot (Collaborator) commented Feb 20, 2024

@llvm/pr-subscribers-clang

@llvm/pr-subscribers-llvm-transforms

Author: None (goldsteinn)

Changes
  • [InstCombine] Add simple test for value of canonicalizing uitofp -> sitofp; NFC
  • [InstCombine] Canonicalize (uitofp x) -> (sitofp x) if x >= 0

Full diff: https://github.com/llvm/llvm-project/pull/82404.diff

5 Files Affected:

  • (modified) llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp (+5-1)
  • (modified) llvm/test/Transforms/InstCombine/add-sitofp.ll (+19)
  • (modified) llvm/test/Transforms/InstCombine/clamp-to-minmax.ll (+2-2)
  • (modified) llvm/test/Transforms/InstCombine/fpcast.ll (+6-6)
  • (modified) llvm/test/Transforms/InstCombine/sitofp.ll (+1-1)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index ed47de287302ed..c22f1d8561ebbb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1929,7 +1929,11 @@ Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) {
 }
 
 Instruction *InstCombinerImpl::visitUIToFP(CastInst &CI) {
-  return commonCastTransforms(CI);
+  if (Instruction *R = commonCastTransforms(CI))
+    return R;
+  if (isKnownNonNegative(CI.getOperand(0), SQ))
+    return new SIToFPInst(CI.getOperand(0), CI.getType());
+  return nullptr;
 }
 
 Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) {
diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll
index db44b806593b64..0b25f7acb085ba 100644
--- a/llvm/test/Transforms/InstCombine/add-sitofp.ll
+++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll
@@ -63,6 +63,25 @@ define double @test_2(i32 %a, i32 %b) {
   ret double %res
 }
 
+define double @test_2_uitofp(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_2_uitofp(
+; CHECK-NEXT:    [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823
+; CHECK-NEXT:    [[B_AND:%.*]] = and i32 [[B:%.*]], 1073741823
+; CHECK-NEXT:    [[ADDCONV:%.*]] = add nuw nsw i32 [[A_AND]], [[B_AND]]
+; CHECK-NEXT:    [[RES:%.*]] = sitofp i32 [[ADDCONV]] to double
+; CHECK-NEXT:    ret double [[RES]]
+;
+  ; Drop two highest bits to guarantee that %a + %b doesn't overflow
+  %a_and = and i32 %a, 1073741823
+  %b_and = and i32 %b, 1073741823
+
+  %a_and_fp = uitofp i32 %a_and to double
+  %b_and_fp = uitofp i32 %b_and to double
+
+  %res = fadd double %a_and_fp, %b_and_fp
+  ret double %res
+}
+
 define float @test_2_neg(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test_2_neg(
 ; CHECK-NEXT:    [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823
diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
index fad1176cc18fac..d997ab2e24bc5b 100644
--- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
+++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
@@ -472,7 +472,7 @@ define float @ui32_clamp_and_cast_to_float(i32 %x) {
 ; CHECK-LABEL: @ui32_clamp_and_cast_to_float(
 ; CHECK-NEXT:    [[LO_CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[MIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 255)
-; CHECK-NEXT:    [[MIN:%.*]] = uitofp i32 [[MIN1]] to float
+; CHECK-NEXT:    [[MIN:%.*]] = sitofp i32 [[MIN1]] to float
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[LO_CMP]], float 1.000000e+00, float [[MIN]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
@@ -488,7 +488,7 @@ define float @ui64_clamp_and_cast_to_float(i64 %x) {
 ; CHECK-LABEL: @ui64_clamp_and_cast_to_float(
 ; CHECK-NEXT:    [[LO_CMP:%.*]] = icmp eq i64 [[X:%.*]], 0
 ; CHECK-NEXT:    [[MIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[X]], i64 255)
-; CHECK-NEXT:    [[MIN:%.*]] = uitofp i64 [[MIN1]] to float
+; CHECK-NEXT:    [[MIN:%.*]] = sitofp i64 [[MIN1]] to float
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[LO_CMP]], float 1.000000e+00, float [[MIN]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/fpcast.ll b/llvm/test/Transforms/InstCombine/fpcast.ll
index 3e5c6fd20b12da..ca47f068b4860f 100644
--- a/llvm/test/Transforms/InstCombine/fpcast.ll
+++ b/llvm/test/Transforms/InstCombine/fpcast.ll
@@ -266,7 +266,7 @@ define half @uint_to_fptrunc(i32 %x) {
 define half @masked_uint_to_fptrunc1(i32 %x) {
 ; CHECK-LABEL: @masked_uint_to_fptrunc1(
 ; CHECK-NEXT:    [[M:%.*]] = and i32 [[X:%.*]], 16777215
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[M]] to half
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[M]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %m = and i32 %x, 16777215
@@ -278,7 +278,7 @@ define half @masked_uint_to_fptrunc1(i32 %x) {
 define half @masked_uint_to_fptrunc2(i32 %x) {
 ; CHECK-LABEL: @masked_uint_to_fptrunc2(
 ; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[M]] to half
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[M]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %m = lshr i32 %x, 8
@@ -290,7 +290,7 @@ define half @masked_uint_to_fptrunc2(i32 %x) {
 define half @masked_uint_to_fptrunc3(i32 %x) {
 ; CHECK-LABEL: @masked_uint_to_fptrunc3(
 ; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[X:%.*]], 7
-; CHECK-NEXT:    [[F:%.*]] = uitofp i32 [[M]] to float
+; CHECK-NEXT:    [[F:%.*]] = sitofp i32 [[M]] to float
 ; CHECK-NEXT:    [[R:%.*]] = fptrunc float [[F]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
@@ -314,7 +314,7 @@ define double @uint_to_fpext(i32 %x) {
 define double @masked_uint_to_fpext1(i32 %x) {
 ; CHECK-LABEL: @masked_uint_to_fpext1(
 ; CHECK-NEXT:    [[M:%.*]] = and i32 [[X:%.*]], 16777215
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[M]] to double
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[M]] to double
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %m = and i32 %x, 16777215
@@ -326,7 +326,7 @@ define double @masked_uint_to_fpext1(i32 %x) {
 define double @masked_uint_to_fpext2(i32 %x) {
 ; CHECK-LABEL: @masked_uint_to_fpext2(
 ; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[M]] to double
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[M]] to double
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %m = lshr i32 %x, 8
@@ -338,7 +338,7 @@ define double @masked_uint_to_fpext2(i32 %x) {
 define double @masked_uint_to_fpext3(i32 %x) {
 ; CHECK-LABEL: @masked_uint_to_fpext3(
 ; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[X:%.*]], 7
-; CHECK-NEXT:    [[F:%.*]] = uitofp i32 [[M]] to float
+; CHECK-NEXT:    [[F:%.*]] = sitofp i32 [[M]] to float
 ; CHECK-NEXT:    [[R:%.*]] = fpext float [[F]] to double
 ; CHECK-NEXT:    ret double [[R]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/sitofp.ll b/llvm/test/Transforms/InstCombine/sitofp.ll
index 5e0cf944880071..086323624a2073 100644
--- a/llvm/test/Transforms/InstCombine/sitofp.ll
+++ b/llvm/test/Transforms/InstCombine/sitofp.ll
@@ -256,7 +256,7 @@ define i25 @consider_lowbits_masked_input(i25 %A) {
 define i32 @overflow_masked_input(i32 %A) {
 ; CHECK-LABEL: @overflow_masked_input(
 ; CHECK-NEXT:    [[M:%.*]] = and i32 [[A:%.*]], 16777217
-; CHECK-NEXT:    [[B:%.*]] = uitofp i32 [[M]] to float
+; CHECK-NEXT:    [[B:%.*]] = sitofp i32 [[M]] to float
 ; CHECK-NEXT:    [[C:%.*]] = fptoui float [[B]] to i32
 ; CHECK-NEXT:    ret i32 [[C]]
 ;

@goldsteinn (Contributor, Author)

My thinking was sitofp is a better canonicalization, but either way we should probably pick one.

@nikic (Contributor) commented Feb 20, 2024

My thinking was sitofp is a better canonicalization, but either way we should probably pick one.

Why is it a better canonicalization? We prefer unsigned over signed operations for everything else (zext, lshr, udiv, icmp u), so this would be very unusual.

@goldsteinn (Contributor, Author) commented Feb 20, 2024

My thinking was sitofp is a better canonicalization, but either way we should probably pick one.

Why is it a better canonicalization? We prefer unsigned over signed operations for everything else (zext, lshr, udiv, icmp u), so this would be very unusual.

There seems to be a lot more support for looking through sitofp than uitofp (in fact, I saw no cases where we support uitofp but not sitofp in InstCombineAddSub/InstCombineMulDiv).

Edit: The little test I added is an example.

Also, fp types are pretty inherently signed, so I think signed -> signed is more intuitive/direct.

@nikic (Contributor) commented Feb 20, 2024

Edit: The little test I added is an example.

Is there some inherent reason why that transform can only work with sitofp, rather than also uitofp with an s/signed/unsigned replacement everywhere? If there isn't, we should extend that transform instead.

@goldsteinn (Contributor, Author)

Edit: The little test I added is an example.

Is there some inherent reason why that transform can only work with sitofp, rather than also uitofp with an s/signed/unsigned replacement everywhere? If there isn't, we should extend that transform instead.

I think it works for both.

One more thing: some backends (x86, for example) have better codegen with si than ui (I'm not sure of any counter-cases).
https://godbolt.org/z/f5joos63o

And it's not too uncommon for info from the middle-end to be lost on the way to the backend.

This can be fixed up elsewhere (i.e., during lowering), but I think it's generally indicative of the somewhat built-in "signedness" of floats, and thus of how si is a better match.
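
For a rough sketch of that codegen gap (assumed typical x86-64 lowering; illustrative, not taken verbatim from the godbolt link above):

define float @si(i32 %x) {
  %f = sitofp i32 %x to float ; typically a single cvtsi2ss
  ret float %f
}

define float @ui(i32 %x) {
  %f = uitofp i32 %x to float ; pre-AVX512, typically a zero-extend to i64
                              ; plus the 64-bit cvtsi2ss (an extra instruction)
  ret float %f
}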

If you're not convinced, let me know and I'll swap this patch around and start work on the regressions.

dtcxzyw added a commit to dtcxzyw/llvm-opt-benchmark that referenced this pull request Feb 21, 2024
@dtcxzyw (Member) commented Feb 21, 2024

Edit: The little test I added is an example.

Is there some inherent reason why that transform can only work with sitofp, rather than also uitofp with an s/signed/unsigned replacement everywhere? If there isn't, we should extend that transform instead.

An example extracted from cpython/_ctypes_test.ll: https://godbolt.org/z/jG6WWPqnE

@goldsteinn (Contributor, Author)

Edit: The little test I added is an example.

Is there some inherent reason why that transform can only work with sitofp, rather than also uitofp with an s/signed/unsigned replacement everywhere? If there isn't, we should extend that transform instead.

An example extracted from cpython/_ctypes_test.ll: https://godbolt.org/z/jG6WWPqnE

I'm working on a patch to expand that fold.

@nikic (Contributor) commented Feb 21, 2024

This can be fixed up elsewhere (i.e., during lowering), but I think it's generally indicative of the somewhat built-in "signedness" of floats, and thus of how si is a better match.

I think this says a lot more about the x86 architecture than about floating-point numbers :) Pre-AVX512 x86 is well known to have very weird holes in the SIMD instruction set -- you could equally say that integers have an inherent preference for signed comparison because x86 can't compare packed unsigned numbers...

If you're not convinced, let me know and I'll swap this patch around and start work on the regressions.

I think we should first address any unnecessary differences between sitofp and uitofp optimization, and then we can check the result of both canonicalization directions on @dtcxzyw's test set and see whether we see any clear benefit in one direction or the other.

@topperc (Collaborator) commented Feb 21, 2024

This can be fixed up elsewhere (i.e., during lowering), but I think it's generally indicative of the somewhat built-in "signedness" of floats, and thus of how si is a better match.

I think this says a lot more about the x86 architecture than about floating-point numbers :) Pre-AVX512 x86 is well known to have very weird holes in the SIMD instruction set -- you could equally say that integers have an inherent preference for signed comparison because x86 can't compare packed unsigned numbers...

I think there's an asymmetry somewhere in PowerPC too, but it's for the other direction, fptosi/fptoui. See this comment from DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT:

  // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is     
  // not Legal, check to see if we can use FP_TO_SINT instead.  (If both UINT    
  // and SINT conversions are Custom, there is no way to tell which is           
  // preferable. We choose SINT because that's the right thing on PPC.)

@goldsteinn (Contributor, Author)

See #82555; once that gets in, I'll revise this patch and repost.

@goldsteinn (Contributor, Author)

Got #82555 in; this now canonicalizes to unsigned.
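
In the flipped direction, a minimal sketch of the new canonical form (mirroring the earlier example; an assumed shape, not a test from the patch):

define float @src(i32 %x) {
  %m = and i32 %x, 32767      ; %m is known non-negative
  %f = sitofp i32 %m to float
  ret float %f
}

define float @tgt(i32 %x) {
  %m = and i32 %x, 32767
  %f = uitofp i32 %m to float ; the unsigned cast is now the canonical form
  ret float %f
}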

@goldsteinn changed the title from "[InstCombine] Canonicalize (uitofp x) -> (sitofp x) if x >= 0" to "[InstCombine] Canonicalize (sitofp x) -> (uitofp x) if x >= 0" on Mar 6, 2024
dtcxzyw added a commit to dtcxzyw/llvm-opt-benchmark that referenced this pull request Mar 7, 2024
@nikic (Contributor) commented Mar 7, 2024

@dtcxzyw Can you please test the new version?

@dtcxzyw (Member) commented Mar 7, 2024

@dtcxzyw Can you please test the new version?

Done.

@dtcxzyw (Member) commented Mar 7, 2024

Regression:

define float @src(i32 %shr.i) {
  %and.i = and i32 %shr.i, 32767
  %sub = uitofp i32 %and.i to float
  %add = fadd float %sub, -16383.0
  ret float %add
}

define float @tgt(i32 %shr.i) {
  %and.i = and i32 %shr.i, 32767
  %addconv = add nsw i32 %and.i, -16383
  %sub = sitofp i32 %addconv to float
  ret float %sub
}

Alive2: https://alive2.llvm.org/ce/z/W6Ik5C

@goldsteinn (Contributor, Author)

Regression: [IR quoted above] Alive2: https://alive2.llvm.org/ce/z/W6Ik5C

Is that a regression? Seems like the fadd -> add is desirable. Likewise, we don't really know the sign of the returned value in either case.

@dtcxzyw (Member) commented Mar 7, 2024

Regression: [IR quoted above] Alive2: https://alive2.llvm.org/ce/z/W6Ik5C

Is that a regression? Seems like the fadd -> add is desirable. Likewise, we don't really know the sign of the returned value in either case.

dtcxzyw/llvm-opt-benchmark#248 (comment)
But this patch transforms add + sitofp into uitofp + fadd :(

@goldsteinn (Contributor, Author)

Regression: [IR quoted above] Alive2: https://alive2.llvm.org/ce/z/W6Ik5C

Is that a regression? Seems like the fadd -> add is desirable. Likewise, we don't really know the sign of the returned value in either case.

dtcxzyw/llvm-opt-benchmark#248 (comment) But this patch transforms add + sitofp into uitofp + fadd :(

Ah, I see; I thought you meant the other way around.

Okay, I think what needs to change is that we need to be able to handle uitofp like sitofp in foldFBinOpOfIntCasts. I'll make a patch.

@goldsteinn (Contributor, Author)

define float @src(i32 %shr.i) {
  %and.i = and i32 %shr.i, 32767
  %sub = uitofp i32 %and.i to float
  %add = fadd float %sub, -16383.0
  ret float %add
}

See: #84389

@goldsteinn (Contributor, Author)

@dtcxzyw can you re-run?

@asmok-g commented Mar 19, 2024

Heads up: we noticed at Google that this is causing the following test to fail:

https://github.com/google/swiftshader/blob/master/tests/ReactorUnitTests/ReactorUnitTests.cpp#L1312

I need to put together a more proper reproducer, but thought that at least posting the heads-up might be faster to unblock us, and maybe the problem is clear. I see that adding a flag vs. reverting was already brought up in the last comment; what's the plan for that?

@goldsteinn (Contributor, Author)

I'll revert this, and re-post if I get around to adding a flag.
Any chance you can still create a repro of the bug you're seeing, for when I re-post?

@goldsteinn (Contributor, Author)

Heads up: we noticed at Google that this is causing the following test to fail: […]

Reverted with: 6960ace

If you can get the repro though, that would still be useful for when I revisit.

@alexfh (Contributor) commented Mar 20, 2024

Apart from the correctness issues, we've seen some regressions on various benchmarks from LLVM Test Suite after this patch. Specifically, around 3-5% regression on x86-64 in various metrics of the Interpolation benchmarks, and up to 30% regression on a number of floating-point-centric benchmarks from https://github.com/llvm/llvm-test-suite/tree/main/SingleSource/Benchmarks/Misc (flops-4.c, flops-5.c, flops-6.c, flops-8.c, fp-convert.c). The numbers vary depending on the microarchitecture, with Skylake being less affected (on the order of ~10%) and AMD Rome showing larger regressions (up to 30%).

@goldsteinn (Contributor, Author)

Apart from the correctness issues, we've seen some regressions on various benchmarks from LLVM Test Suite after this patch. […]

Thank you for the info. Well, it's reverted now, so there's nothing to do. Although that does motivate me to re-post with the nneg flag support, as it sounds like getting the right cast can be important, and that will allow us to do a better job than we currently do.

@asmok-g commented Mar 20, 2024

Well, I'm not sure how proper this would be as a reproducer,

I extracted the mentioned test to a program:

#include <cmath>

#include "third_party/swiftshader/src/Reactor/Coroutine.hpp"
#include "third_party/swiftshader/src/Reactor/Reactor.hpp"

using float4 = float[4];
using int4 = int[4];

static std::string testName() { return "test name"; }

using namespace rr;

int main() {
  Function<Void(Pointer<Int4>, Pointer<Int4>)> function;
  {
    Pointer<Int4> x = function.Arg<0>();
    Pointer<Int4> y = function.Arg<1>();

    *y = Abs(*x);
  }

  auto routine = function(testName().c_str());
  auto callable = (void (*)(int4 *, int4 *))routine->getEntry();

  int input[] = {1, -1, 0, (int)0x80000000};

  for (int x : input) {
    int4 v_in = {x, x, x, x};
    int4 v_out = {0};

    callable(&v_in, &v_out);

    float expected = abs(x);
    if (v_out[0] != expected) {
      return 10;
    }
  }
}

Before the change the program exits with 0; after it, it exits with 10.
More precisely: v_out[0] is -2147483648, expected is 2.14748365e+09.

out_at.txt
out_before.txt

I attached the generated IR before and after the change. It's only a one-line diff:

   call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(16) %9, i8 0, i64 16, i1 false)
   call void %49(ptr noundef nonnull %8, ptr noundef nonnull %9) #16
   %61 = call i32 @llvm.abs.i32(i32 %60, i1 true)
-  %62 = sitofp i32 %61 to float
+  %62 = uitofp i32 %61 to float
   %63 = load i32, ptr %9, align 16
   %64 = sitofp i32 %63 to float
   %65 = fcmp une float %64, %62

The generation command is: clang -O1 -fsized-deallocation '-std=gnu++20' -c pre.ii -emit-llvm -S -o /tmp/ir_before.ll

I'm trying to reduce the preprocessed file, though I'm not sure it'll keep the semantics of the failure.

@goldsteinn (Contributor, Author)

Well, I'm not sure how proper this would be as a reproducer, […]

This should be sufficient, thank you!

@dtcxzyw (Member) commented Mar 21, 2024

Apart from the correctness issues, we've seen some regressions on various benchmarks from LLVM Test Suite after this patch. […]

FYI this patch saves ~3% instructions for some benchmarks from LLVM-test-suite on RISC-V.
dtcxzyw/llvm-ci#1115
dtcxzyw/llvm-ci#1114

@topperc (Collaborator) commented Mar 21, 2024

Apart from the correctness issues, we've seen some regressions on various benchmarks from LLVM Test Suite after this patch. […]

FYI this patch saves ~3% instructions for some benchmarks from LLVM-test-suite on RISC-V. dtcxzyw/llvm-ci#1115 dtcxzyw/llvm-ci#1114

Are you able to extract a reproducer that I can look at?

@goldsteinn (Contributor, Author)

See #86141, which adds the nneg flag to IR. Once that gets in, I have patches for InstCombine, CVP, and SCCP to do the transform.
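
For reference, a sketch of what that flag looks like in IR (treat the exact spelling as an assumption based on #86141):

%f = uitofp nneg i32 %x to float ; operand asserted non-negative, so a backend
                                 ; may lower this as either a signed or an
                                 ; unsigned conversion, whichever is cheaper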

goldsteinn added a commit to goldsteinn/llvm-project that referenced this pull request Mar 21, 2024
As noted when llvm#82404 was pushed (canonicalizing `sitofp` -> `uitofp`),
different signedness on fp casts can have dramatic performance
implications on different backends.

So, it makes sense to create a reliable means for the backend to pick its
cast signedness if either is correct.

Further, this allows us to start canonicalizing `sitofp` -> `uitofp`,
which may ease middle-end analysis.
@goldsteinn (Contributor, Author)

Well, I'm not sure how proper this would be as a reproducer, […]

This should be sufficient, thank you!

Might have been related to: 1283646

goldsteinn added a commit to goldsteinn/llvm-project that referenced this pull request Mar 21, 2024
This is essentially the same as llvm#82404 but has the `nneg` flag which
allows the backend to reliably undo the transform.
@topperc (Collaborator) commented Mar 21, 2024

Well, I'm not sure how proper this would be as a reproducer, […]

Might have been related to: 1283646

Doesn't that program call abs on 0x80000000, which is poison?
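
For reference, a minimal sketch of that point (my reading, not spelled out in the thread): with the second argument set, llvm.abs is poison for INT_MIN, so the optimizer may treat its result as non-negative and swap the cast.

%a = call i32 @llvm.abs.i32(i32 -2147483648, i1 true) ; INT_MIN with the
                                                      ; int-min-is-poison flag
                                                      ; set: result is poison
%f = uitofp i32 %a to float                           ; poison propagates, so
                                                      ; either cast is "correct"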

goldsteinn added commits to goldsteinn/llvm-project that referenced this pull request on Mar 21, Mar 22, Mar 26, Mar 27, Apr 4, and Apr 9, 2024, all with the same commit message as above.
goldsteinn added a commit that referenced this pull request Apr 9, 2024
As noted when #82404 was pushed (canonicalizing `sitofp` -> `uitofp`),
different signedness on fp casts can have dramatic performance
implications on different backends.

So, it makes sense to create a reliable means for the backend to pick its
cast signedness if either is correct.

Further, this allows us to start canonicalizing `sitofp` -> `uitofp`,
which may ease middle-end analysis.

Closes #86141
goldsteinn added commits to goldsteinn/llvm-project that referenced this pull request on Apr 10, 2024, with the same commit message as the Mar 21 nneg commit above.
goldsteinn added a commit that referenced this pull request Apr 16, 2024
This is essentially the same as #82404 but has the `nneg` flag which
allows the backend to reliably undo the transform.

Closes #88299
Labels: clang, llvm:transforms