WIP

pitrou · Jun 6, 2023 · 9ea5dd6 · 9ea5dd6
1 parent 7f8ccb5
commit 9ea5dd6
Show file tree

Hide file tree

Showing 3 changed files with 160 additions and 43 deletions.
diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc
@@ -75,6 +75,11 @@ static constexpr uint64_t kUInt64PowersOfTen[kInt64DecimalDigits + 1] = {
     // clang-format on
 };
 
+// Caution: these pre-computed constants may not exactly represent their
+// decimal counterparts. For example, in Python:
+//   >>> int(1e38)
+//   99999999999999997748809823456034029568
+
 static constexpr float kFloatPowersOfTen[2 * 38 + 1] = {
     1e-38f, 1e-37f, 1e-36f, 1e-35f, 1e-34f, 1e-33f, 1e-32f, 1e-31f, 1e-30f, 1e-29f,
     1e-28f, 1e-27f, 1e-26f, 1e-25f, 1e-24f, 1e-23f, 1e-22f, 1e-21f, 1e-20f, 1e-19f,

diff --git a/cpp/src/arrow/util/decimal_test.cc b/cpp/src/arrow/util/decimal_test.cc
@@ -753,7 +753,7 @@ TEST_P(Decimal128ToStringTest, ToString) {
   const ToStringTestParam& param = GetParam();
   const Decimal128 value(param.test_value);
   const std::string printed_value = value.ToString(param.scale);
-  ASSERT_EQ(param.expected_string, printed_value);
+  EXPECT_EQ(param.expected_string, printed_value);
 }
 
 INSTANTIATE_TEST_SUITE_P(Decimal128ToStringTest, Decimal128ToStringTest,
@@ -763,14 +763,14 @@ template <typename Decimal, typename Real>
 void CheckDecimalFromReal(Real real, int32_t precision, int32_t scale,
                           const std::string& expected) {
   ASSERT_OK_AND_ASSIGN(auto dec, Decimal::FromReal(real, precision, scale));
-  ASSERT_EQ(dec.ToString(scale), expected);
+  EXPECT_EQ(dec.ToString(scale), expected);
 }
 
 template <typename Decimal, typename Real>
 void CheckDecimalFromRealIntegerString(Real real, int32_t precision, int32_t scale,
                                        const std::string& expected) {
   ASSERT_OK_AND_ASSIGN(auto dec, Decimal::FromReal(real, precision, scale));
-  ASSERT_EQ(dec.ToIntegerString(), expected);
+  EXPECT_EQ(dec.ToIntegerString(), expected);
 }
 
 template <typename Real>
@@ -868,24 +868,27 @@ template <typename T>
 class TestDecimalFromRealFloat : public ::testing::Test {
  protected:
   std::vector<FromFloatTestParam> GetValues() {
-    return {// 2**63 + 2**40 (exactly representable in a float's 24 bits of precision)
-            FromFloatTestParam{9.223373e+18f, 19, 0, "9223373136366403584"},
-            FromFloatTestParam{-9.223373e+18f, 19, 0, "-9223373136366403584"},
-            FromFloatTestParam{9.223373e+14f, 19, 4, "922337313636640.3584"},
-            FromFloatTestParam{-9.223373e+14f, 19, 4, "-922337313636640.3584"},
-            // 2**64 - 2**40 (exactly representable in a float)
-            FromFloatTestParam{1.8446743e+19f, 20, 0, "18446742974197923840"},
-            FromFloatTestParam{-1.8446743e+19f, 20, 0, "-18446742974197923840"},
-            // 2**64 + 2**41 (exactly representable in a float)
-            FromFloatTestParam{1.8446746e+19f, 20, 0, "18446746272732807168"},
-            FromFloatTestParam{-1.8446746e+19f, 20, 0, "-18446746272732807168"},
-            FromFloatTestParam{1.8446746e+15f, 20, 4, "1844674627273280.7168"},
-            FromFloatTestParam{-1.8446746e+15f, 20, 4, "-1844674627273280.7168"},
-            // Almost 10**38 (minus 2**103)
-            FromFloatTestParam{9.999999e+37f, 38, 0,
-                               "99999986661652122824821048795547566080"},
-            FromFloatTestParam{-9.999999e+37f, 38, 0,
-                               "-99999986661652122824821048795547566080"}};
+    return {
+        // 2**63 + 2**40 (exactly representable in a float's 24 bits of precision)
+        FromFloatTestParam{9.223373e+18f, 19, 0, "9223373136366403584"},
+        FromFloatTestParam{-9.223373e+18f, 19, 0, "-9223373136366403584"},
+        FromFloatTestParam{9.223373e+14f, 19, 4, "922337313636640.3584"},
+        FromFloatTestParam{-9.223373e+14f, 19, 4, "-922337313636640.3584"},
+        // 2**64 - 2**40 (exactly representable in a float)
+        FromFloatTestParam{1.8446743e+19f, 20, 0, "18446742974197923840"},
+        FromFloatTestParam{-1.8446743e+19f, 20, 0, "-18446742974197923840"},
+        // 2**64 + 2**41 (exactly representable in a float)
+        FromFloatTestParam{1.8446746e+19f, 20, 0, "18446746272732807168"},
+        FromFloatTestParam{-1.8446746e+19f, 20, 0, "-18446746272732807168"},
+        FromFloatTestParam{1.8446746e+15f, 20, 4, "1844674627273280.7168"},
+        FromFloatTestParam{-1.8446746e+15f, 20, 4, "-1844674627273280.7168"},
+        // Almost 10**38 (minus 2**103)
+        FromFloatTestParam{9.999999e+37f, 38, 0,
+                           "99999986661652122824821048795547566080"},
+        FromFloatTestParam{-9.999999e+37f, 38, 0,
+                           "-99999986661652122824821048795547566080"},
+        // TODO Hand-picked test cases that can involve precision issues
+    };
   }
 };
 TYPED_TEST_SUITE(TestDecimalFromRealFloat, DecimalTypes);
@@ -916,28 +919,44 @@ template <typename T>
 class TestDecimalFromRealDouble : public ::testing::Test {
  protected:
   std::vector<FromDoubleTestParam> GetValues() {
-    return {// 2**63 + 2**11 (exactly representable in a double's 53 bits of precision)
-            FromDoubleTestParam{9.223372036854778e+18, 19, 0, "9223372036854777856"},
-            FromDoubleTestParam{-9.223372036854778e+18, 19, 0, "-9223372036854777856"},
-            FromDoubleTestParam{9.223372036854778e+10, 19, 8, "92233720368.54777856"},
-            FromDoubleTestParam{-9.223372036854778e+10, 19, 8, "-92233720368.54777856"},
-            // 2**64 - 2**11 (exactly representable in a double)
-            FromDoubleTestParam{1.844674407370955e+19, 20, 0, "18446744073709549568"},
-            FromDoubleTestParam{-1.844674407370955e+19, 20, 0, "-18446744073709549568"},
-            // 2**64 + 2**11 (exactly representable in a double)
-            FromDoubleTestParam{1.8446744073709556e+19, 20, 0, "18446744073709555712"},
-            FromDoubleTestParam{-1.8446744073709556e+19, 20, 0, "-18446744073709555712"},
-            FromDoubleTestParam{1.8446744073709556e+15, 20, 4, "1844674407370955.5712"},
-            FromDoubleTestParam{-1.8446744073709556e+15, 20, 4, "-1844674407370955.5712"},
-            // Almost 10**38 (minus 2**73)
-            FromDoubleTestParam{9.999999999999998e+37, 38, 0,
-                                "99999999999999978859343891977453174784"},
-            FromDoubleTestParam{-9.999999999999998e+37, 38, 0,
-                                "-99999999999999978859343891977453174784"},
-            FromDoubleTestParam{9.999999999999998e+27, 38, 10,
-                                "9999999999999997885934389197.7453174784"},
-            FromDoubleTestParam{-9.999999999999998e+27, 38, 10,
-                                "-9999999999999997885934389197.7453174784"}};
+    return {
+        // 2**63 + 2**11 (exactly representable in a double's 53 bits of precision)
+        FromDoubleTestParam{9.223372036854778e+18, 19, 0, "9223372036854777856"},
+        FromDoubleTestParam{-9.223372036854778e+18, 19, 0, "-9223372036854777856"},
+        FromDoubleTestParam{9.223372036854778e+10, 19, 8, "92233720368.54777856"},
+        FromDoubleTestParam{-9.223372036854778e+10, 19, 8, "-92233720368.54777856"},
+        // 2**64 - 2**11 (exactly representable in a double)
+        FromDoubleTestParam{1.844674407370955e+19, 20, 0, "18446744073709549568"},
+        FromDoubleTestParam{-1.844674407370955e+19, 20, 0, "-18446744073709549568"},
+        // 2**64 + 2**11 (exactly representable in a double)
+        FromDoubleTestParam{1.8446744073709556e+19, 20, 0, "18446744073709555712"},
+        FromDoubleTestParam{-1.8446744073709556e+19, 20, 0, "-18446744073709555712"},
+        FromDoubleTestParam{1.8446744073709556e+15, 20, 4, "1844674407370955.5712"},
+        FromDoubleTestParam{-1.8446744073709556e+15, 20, 4, "-1844674407370955.5712"},
+        // Almost 10**38 (minus 2**73)
+        FromDoubleTestParam{9.999999999999998e+37, 38, 0,
+                            "99999999999999978859343891977453174784"},
+        FromDoubleTestParam{-9.999999999999998e+37, 38, 0,
+                            "-99999999999999978859343891977453174784"},
+        FromDoubleTestParam{9.999999999999998e+27, 38, 10,
+                            "9999999999999997885934389197.7453174784"},
+        FromDoubleTestParam{-9.999999999999998e+27, 38, 10,
+                            "-9999999999999997885934389197.7453174784"},
+        // Hand-picked test cases that can involve precision issues.
+        // More comprehensive testing is done in the PyArrow test suite.
+        FromDoubleTestParam{999999999999999.0, 16, 1, "999999999999999.0"},
+        FromDoubleTestParam{-999999999999999.0, 16, 1, "-999999999999999.0"},
+        FromDoubleTestParam{9999999999999998.0, 17, 1, "9999999999999998.0"},
+        FromDoubleTestParam{-9999999999999998.0, 17, 1, "-9999999999999998.0"},
+        FromDoubleTestParam{999999999999999.9, 16, 1, "999999999999999.9"},
+        FromDoubleTestParam{-999999999999999.9, 16, 1, "-999999999999999.9"},
+        FromDoubleTestParam{9999999987., 38, 22, "9999999987.0000000000000000000000"},
+        FromDoubleTestParam{-9999999987., 38, 22, "-9999999987.0000000000000000000000"},
+        FromDoubleTestParam{9999999987., 38, 28,
+                            "9999999987.0000000000000000000000000000"},
+        FromDoubleTestParam{-9999999987., 38, 28,
+                            "-9999999987.0000000000000000000000000000"},
+    };
   }
 };
 TYPED_TEST_SUITE(TestDecimalFromRealDouble, DecimalTypes);
@@ -952,15 +971,26 @@ TYPED_TEST(TestDecimalFromRealDouble, SuccessConversion) {
 TYPED_TEST(TestDecimalFromRealDouble, LargeValues) {
   // Test the entire double range
   for (int32_t scale = -308; scale <= 308; ++scale) {
+    ARROW_SCOPED_TRACE("scale = ", scale);
     double real = std::pow(10.0, static_cast<double>(scale));
     CheckDecimalFromRealIntegerString<TypeParam>(real, 1, -scale, "1");
   }
   for (int32_t scale = -307; scale <= 306; ++scale) {
+    ARROW_SCOPED_TRACE("scale = ", scale);
     double real = 123. * std::pow(10.0, static_cast<double>(scale));
     CheckDecimalFromRealIntegerString<TypeParam>(real, 2, -scale - 1, "12");
     CheckDecimalFromRealIntegerString<TypeParam>(real, 3, -scale, "123");
     CheckDecimalFromRealIntegerString<TypeParam>(real, 4, -scale + 1, "1230");
   }
+  for (int32_t scale = -292; scale <= 291; ++scale) {
+    // Exactly 16 decimal digits can fit in a double's mantissa
+    double real = 1234567890123456. * std::pow(10.0, static_cast<double>(scale));
+    ARROW_SCOPED_TRACE("scale = ", scale);
+    CheckDecimalFromRealIntegerString<TypeParam>(real, 15, -scale - 1, "123456789012346");
+    CheckDecimalFromRealIntegerString<TypeParam>(real, 16, -scale, "1234567890123456");
+    // Cannot test precision 17 as the trailing digit depends on FP rounding
+    // during the computation of `real` above.
+  }
 }
 
 // Additional values that only apply to Decimal256

diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
@@ -15,10 +15,13 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from collections import namedtuple
 import datetime
+import decimal
 from functools import lru_cache, partial
 import inspect
 import itertools
+import math
 import os
 import pickle
 import pytest
@@ -79,6 +82,8 @@
     pa.float64()
 ]
 
+floating_point_arrow_types = [pa.float32(), pa.float64()]
+
 
 def test_exported_functions():
     # Check that all exported concrete functions can be called with
@@ -1818,6 +1823,83 @@ def test_fsl_to_fsl_cast(value_type):
         fsl.cast(cast_type)
 
 
+DecimalTypeTraits = namedtuple('DecimalTypeTraits',
+                               ('name', 'factory', 'max_precision'))
+
+FloatToDecimalCase = namedtuple('FloatToDecimalCase',
+                                ('precision', 'scale', 'float_val'))
+
+# TODO: also exercise decimal256, e.g.
+#   DecimalTypeTraits('decimal256', pa.decimal256, 76)
+decimal_type_traits = [DecimalTypeTraits('decimal128', pa.decimal128, 38)]
+
+
+def largest_scaled_float_not_above(val, scale):
+    # Find the largest float f such that `f * 10**scale <= val`
+    assert val >= 0
+    float_val = float(val * 10**-scale)
+    if float_val * 10**scale > val:
+        # Take the float just below... it *should* satisfy
+        float_val = math.nextafter(float_val, 0.0)
+        assert float_val * 10**scale <= val
+    #print("val =", val, float_val)
+    return float_val
+
+
+def integral_float_to_decimal_cast_cases(max_precision):
+    for precision in range(1, max_precision):
+        #for scale in range(0, min(10, precision)):
+        for scale in range(0, precision):
+            abs_minval = 1
+            # Exact unscaled limit in the integer domain
+            abs_maxval = largest_scaled_float_not_above(
+                10**precision - 10**scale, scale)
+            for val in (0, abs_minval, -abs_minval,
+                        abs_maxval, -abs_maxval):
+                yield FloatToDecimalCase(precision, scale, float(val))
+
+
+def real_float_to_decimal_cast_cases(max_precision):
+    for precision in range(1, max_precision):
+        for scale in range(0, min(10, precision)):
+        #for scale in range(0, precision):
+            abs_minval = largest_scaled_float_not_above(1, scale)
+            abs_maxval = largest_scaled_float_not_above(
+                10**precision - 1, scale)
+            for val in (0, abs_minval, -abs_minval,
+                        abs_maxval, -abs_maxval):
+                yield FloatToDecimalCase(precision, scale, float(val))
+
+
+#@pytest.mark.parametrize('float_ty', floating_point_arrow_types, ids=str)
+@pytest.mark.parametrize('float_ty', [pa.float64()], ids=str)
+@pytest.mark.parametrize('decimal_ty', decimal_type_traits,
+                         ids=lambda v: v.name)
+@pytest.mark.parametrize('case_generator',
+                         [integral_float_to_decimal_cast_cases,
+                          real_float_to_decimal_cast_cases],
+                         ids=['integrals', 'reals'])
+def test_cast_float_to_decimal(float_ty, decimal_ty, case_generator):
+    with decimal.localcontext() as ctx:
+        for case in case_generator(decimal_ty.max_precision):
+            # Use the Python decimal module to build the expected result
+            # using the right precision
+            ctx.prec = case.precision
+            # XXX which decimal rounding mode is expected??
+            s = pa.scalar(case.float_val, type=float_ty)
+            expected = ctx.create_decimal_from_float(case.float_val)
+            print(f"{case} => expected = {expected!r}")
+            cast_to = decimal_ty.factory(case.precision, case.scale)
+            actual = pc.cast(s, cast_to).as_py()
+            assert actual == expected
+
+# TODO add random-generated cases of float-to-decimal cast:
+# - by generating random integers and scaling them using decimal.Decimal
+# - by generating random floats and converting them to decimal.Decimal
+# XXX move these to test_cast.py / test_decimal.py?
+# TODO overflow tests
+
+
 def test_strptime():
     arr = pa.array(["5/1/2020", None, "12/13/1900"])