[APFloat] Fix truncation of certain subnormal numbers

Certain subnormal values were incorrectly rounded away from zero when converted to a narrower floating-point format (for example, double to float).

Fixes #55838

Differential Revision: https://reviews.llvm.org/D127140
llvm · Jun 8, 2022 · ed6c309 · ed6c309
1 parent d897a14
commit ed6c309
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 10 deletions.
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
@@ -2213,15 +2213,22 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
   // when truncating from PowerPC double-double to double format), the
   // right shift could lose result mantissa bits.  Adjust exponent instead
   // of performing excessive shift.
+  // Also do a similar trick in case shifting denormal would produce zero
+  // significand as this case isn't handled correctly by normalize.
   if (shift < 0 && isFiniteNonZero()) {
-    int exponentChange = significandMSB() + 1 - fromSemantics.precision;
+    int omsb = significandMSB() + 1;
+    int exponentChange = omsb - fromSemantics.precision;
     if (exponent + exponentChange < toSemantics.minExponent)
       exponentChange = toSemantics.minExponent - exponent;
     if (exponentChange < shift)
       exponentChange = shift;
     if (exponentChange < 0) {
       shift -= exponentChange;
       exponent += exponentChange;
+    } else if (omsb <= -shift) {
+      exponentChange = omsb + shift - 1; // leave at least one bit set
+      shift -= exponentChange;
+      exponent += exponentChange;
     }
   }
 

diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll b/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll
@@ -79,21 +79,17 @@ define float @trunc_denorm_lost_fraction0() {
   ret float %b
 }
 
-; FIXME: This should be 0.0.
-
 define float @trunc_denorm_lost_fraction1() {
 ; CHECK-LABEL: @trunc_denorm_lost_fraction1(
-; CHECK-NEXT:    ret float 0x36A0000000000000
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
   %b = fptrunc double 0x0000000010000001 to float
   ret float %b
 }
 
-; FIXME: This should be 0.0.
-
 define float @trunc_denorm_lost_fraction2() {
 ; CHECK-LABEL: @trunc_denorm_lost_fraction2(
-; CHECK-NEXT:    ret float 0x36A0000000000000
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
   %b = fptrunc double 0x000000001fffffff to float
   ret float %b
@@ -107,11 +103,9 @@ define float @trunc_denorm_lost_fraction3() {
   ret float %b
 }
 
-; FIXME: This should be -0.0.
-
 define float @trunc_denorm_lost_fraction4() {
 ; CHECK-LABEL: @trunc_denorm_lost_fraction4(
-; CHECK-NEXT:    ret float 0xB6A0000000000000
+; CHECK-NEXT:    ret float -0.000000e+00
 ;
   %b = fptrunc double 0x8000000010000001 to float
   ret float %b

diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
@@ -1859,6 +1859,48 @@ TEST(APFloatTest, convert) {
   EXPECT_EQ(0x7fc00000, test.bitcastToAPInt());
   EXPECT_TRUE(losesInfo);
   EXPECT_EQ(status, APFloat::opOK);
+
+  // Test that subnormals are handled correctly in double to float conversion
+  test = APFloat(APFloat::IEEEdouble(), "0x0.0000010000000p-1022");
+  test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  test = APFloat(APFloat::IEEEdouble(), "0x0.0000010000001p-1022");
+  test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  test = APFloat(APFloat::IEEEdouble(), "-0x0.0000010000001p-1022");
+  test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  test = APFloat(APFloat::IEEEdouble(), "0x0.0000020000000p-1022");
+  test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  test = APFloat(APFloat::IEEEdouble(), "0x0.0000020000001p-1022");
+  test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  // Test subnormal conversion to bfloat
+  test = APFloat(APFloat::IEEEsingle(), "0x0.01p-126");
+  test.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  test = APFloat(APFloat::IEEEsingle(), "0x0.02p-126");
+  test.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0x01, test.bitcastToAPInt());
+  EXPECT_FALSE(losesInfo);
+
+  test = APFloat(APFloat::IEEEsingle(), "0x0.01p-126");
+  test.convert(APFloat::BFloat(), APFloat::rmNearestTiesToAway, &losesInfo);
+  EXPECT_EQ(0x01, test.bitcastToAPInt());
+  EXPECT_TRUE(losesInfo);
 }
 
 TEST(APFloatTest, PPCDoubleDouble) {