[libc][math] Implement double precision sin correctly rounded to all …

…rounding modes. (#95736) - Algorithm: - Step 1 - Range reduction: for a double precision input `x`, return `k` and `u` such that - k is an integer - u = x - k * pi / 128, and |u| < pi/256 - Step 2 - Calculate `sin(u)` and `cos(u)` in double-double using Taylor polynomials with errors < 2^-70 with FMA or < 2^-66 w/o FMA. - Step 3 - Calculate `sin(x) = sin(k*pi/128) * cos(u) + cos(k*pi/128) * sin(u)` using look-up table for `sin(k*pi/128)` and `cos(k*pi/128)`. - Step 4 - Use Ziv's rounding test to decide if the result is correctly rounded. - Step 4' - If the Ziv's rounding test failed, redo step 1-3 using 128-bit precision. - Currently, without FMA instructions, the large range reduction only works correctly for the default rounding mode (FE_TONEAREST). - Provide `LIBC_MATH` flag so that users can set `LIBC_MATH = LIBC_MATH_SKIP_ACCURATE_PASS` to build the `sin` function without step 4 and 4'.
llvm · Jun 24, 2024 · 16903ac · 16903ac
1 parent 5ae5069
commit 16903ac
Show file tree

Hide file tree

Showing 19 changed files with 1,790 additions and 59 deletions.
diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
@@ -226,6 +226,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.scalbnl
     libc.src.math.sincosf
     libc.src.math.sinhf
+    libc.src.math.sin
     libc.src.math.sinf
     libc.src.math.sqrt
     libc.src.math.sqrtf

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
@@ -481,6 +481,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.scalbnl
     libc.src.math.sincosf
     libc.src.math.sinhf
+    libc.src.math.sin
     libc.src.math.sinf
     libc.src.math.sqrt
     libc.src.math.sqrtf

diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
@@ -354,6 +354,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.scalbnf
     libc.src.math.scalbnl
     libc.src.math.sincosf
+    libc.src.math.sin
     libc.src.math.sinf
     libc.src.math.sinhf
     libc.src.math.sqrt

diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
@@ -489,6 +489,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.scalbnl
     libc.src.math.sincosf
     libc.src.math.sinhf
+    libc.src.math.sin
     libc.src.math.sinf
     libc.src.math.sqrt
     libc.src.math.sqrtf

diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
@@ -322,7 +322,7 @@ Higher Math Functions
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | rsqrt     |                  |                 |                        |                      |                        | 7.12.7.9               | F.10.4.9                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| sin       | |check|          | large           |                        |                      |                        | 7.12.4.6               | F.10.1.6                   |
+| sin       | |check|          | |check|         |                        |                      |                        | 7.12.4.6               | F.10.1.6                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | sincos    | |check|          | large           |                        |                      |                        |                        |                            |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+

diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h
@@ -21,12 +21,22 @@ using DoubleDouble = LIBC_NAMESPACE::NumberPair<double>;
 // The output of Dekker's FastTwoSum algorithm is correct, i.e.:
 //   r.hi + r.lo = a + b exactly
 //   and |r.lo| < eps(r.lo)
-// if ssumption: |a| >= |b|, or a = 0.
+// Assumption: |a| >= |b|, or a = 0.
+template <bool FAST2SUM = true>
 LIBC_INLINE constexpr DoubleDouble exact_add(double a, double b) {
   DoubleDouble r{0.0, 0.0};
-  r.hi = a + b;
-  double t = r.hi - a;
-  r.lo = b - t;
+  if constexpr (FAST2SUM) {
+    r.hi = a + b;
+    double t = r.hi - a;
+    r.lo = b - t;
+  } else {
+    r.hi = a + b;
+    double t1 = r.hi - a;
+    double t2 = r.hi - t1;
+    double t3 = b - t1;
+    double t4 = a - t2;
+    r.lo = t3 + t4;
+  }
   return r;
 }
 
@@ -40,22 +50,35 @@ LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a,
 
 // Assumption: |a.hi| >= |b|
 LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) {
-  DoubleDouble r = exact_add(a.hi, b);
+  DoubleDouble r = exact_add<false>(a.hi, b);
   return exact_add(r.hi, r.lo + a.lo);
 }
 
-// Velkamp's Splitting for double precision.
-LIBC_INLINE constexpr DoubleDouble split(double a) {
+// Veltkamp's Splitting for double precision.
+// Note: This is proved to be correct for all rounding modes:
+//   Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
+//   Roundings," https://inria.hal.science/hal-04480440.
+// Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1.
+template <size_t N = 27> LIBC_INLINE constexpr DoubleDouble split(double a) {
   DoubleDouble r{0.0, 0.0};
-  // Splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1.
-  constexpr double C = 0x1.0p27 + 1.0;
+  // CN = 2^N.
+  constexpr double CN = static_cast<double>(1 << N);
+  constexpr double C = CN + 1.0;
   double t1 = C * a;
   double t2 = a - t1;
   r.hi = t1 + t2;
   r.lo = a - r.hi;
   return r;
 }
 
+// Note: When FMA instruction is not available, the `exact_mult` function is
+// only correct for round-to-nearest mode.  See:
+//   Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
+//   Roundings," https://inria.hal.science/hal-04480440.
+// Using Theorem 1 in the paper above, without FMA instruction, if we restrict
+// the generated constants to precision <= 51, and splitting it by 2^28 + 1,
+// then a * b = r.hi + r.lo is exact for all rounding modes.
+template <bool NO_FMA_ALL_ROUNDINGS = false>
 LIBC_INLINE DoubleDouble exact_mult(double a, double b) {
   DoubleDouble r{0.0, 0.0};
 
@@ -65,7 +88,13 @@ LIBC_INLINE DoubleDouble exact_mult(double a, double b) {
 #else
   // Dekker's Product.
   DoubleDouble as = split(a);
-  DoubleDouble bs = split(b);
+  DoubleDouble bs;
+
+  if constexpr (NO_FMA_ALL_ROUNDINGS)
+    bs = split<28>(b);
+  else
+    bs = split(b);
+
   r.hi = a * b;
   double t1 = as.hi * bs.hi - r.hi;
   double t2 = as.hi * bs.lo + t1;
@@ -82,9 +111,10 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) {
   return r;
 }
 
+template <bool NO_FMA_ALL_ROUNDINGS = false>
 LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a,
                                     const DoubleDouble &b) {
-  DoubleDouble r = exact_mult(a.hi, b.hi);
+  DoubleDouble r = exact_mult<NO_FMA_ALL_ROUNDINGS>(a.hi, b.hi);
   double t1 = multiply_add(a.hi, b.lo, r.lo);
   double t2 = multiply_add(a.lo, b.hi, t1);
   r.lo = t2;

diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h
@@ -278,11 +278,11 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a,
 // don't need to normalize the inputs again in this function.  If the inputs are
 // not normalized, the results might lose precision significantly.
 template <size_t Bits>
-LIBC_INLINE constexpr DyadicFloat<Bits> quick_mul(DyadicFloat<Bits> a,
-                                                  DyadicFloat<Bits> b) {
+LIBC_INLINE constexpr DyadicFloat<Bits> quick_mul(const DyadicFloat<Bits> &a,
+                                                  const DyadicFloat<Bits> &b) {
   DyadicFloat<Bits> result;
   result.sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS;
-  result.exponent = a.exponent + b.exponent + int(Bits);
+  result.exponent = a.exponent + b.exponent + static_cast<int>(Bits);
 
   if (!(a.mantissa.is_zero() || b.mantissa.is_zero())) {
     result.mantissa = a.mantissa.quick_mul_hi(b.mantissa);
@@ -309,7 +309,7 @@ multiply_add(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b,
 // Simple exponentiation implementation for printf. Only handles positive
 // exponents, since division isn't implemented.
 template <size_t Bits>
-LIBC_INLINE constexpr DyadicFloat<Bits> pow_n(DyadicFloat<Bits> a,
+LIBC_INLINE constexpr DyadicFloat<Bits> pow_n(const DyadicFloat<Bits> &a,
                                               uint32_t power) {
   DyadicFloat<Bits> result = 1.0;
   DyadicFloat<Bits> cur_power = a;
@@ -325,7 +325,7 @@ LIBC_INLINE constexpr DyadicFloat<Bits> pow_n(DyadicFloat<Bits> a,
 }
 
 template <size_t Bits>
-LIBC_INLINE constexpr DyadicFloat<Bits> mul_pow_2(DyadicFloat<Bits> a,
+LIBC_INLINE constexpr DyadicFloat<Bits> mul_pow_2(const DyadicFloat<Bits> &a,
                                                   int32_t pow_2) {
   DyadicFloat<Bits> result = a;
   result.exponent += pow_2;

diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h
@@ -33,4 +33,18 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) {
 #error "Unhandled compiler"
 #endif
 
+// Defining optimization options for math functions.
+// TODO: Exporting this to public generated headers?
+#define LIBC_MATH_SKIP_ACCURATE_PASS 0x01
+#define LIBC_MATH_SMALL_TABLES 0x02
+#define LIBC_MATH_NO_ERRNO 0x04
+#define LIBC_MATH_NO_EXCEPT 0x08
+#define LIBC_MATH_FAST                                                         \
+  (LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES |                     \
+   LIBC_MATH_NO_ERRNO | LIBC_MATH_NO_EXCEPT)
+
+#ifndef LIBC_MATH
+#define LIBC_MATH 0
+#endif // LIBC_MATH
+
 #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
@@ -135,6 +135,23 @@ add_header_library(
     libc.src.__support.common
 )
 
+add_header_library(
+  range_reduction_double
+  HDRS
+    range_reduction_double_common.h
+    range_reduction_double_fma.h
+    range_reduction_double_nofma.h
+  DEPENDS
+  libc.src.__support.FPUtil.double_double
+  libc.src.__support.FPUtil.dyadic_float
+  libc.src.__support.FPUtil.fp_bits
+  libc.src.__support.FPUtil.fma
+  libc.src.__support.FPUtil.multiply_add
+  libc.src.__support.FPUtil.nearest_integer
+  libc.src.__support.common
+  libc.src.__support.integer_literals
+)
+
 add_header_library(
   sincosf_utils
   HDRS
@@ -146,6 +163,15 @@ add_header_library(
     libc.src.__support.common
 )
 
+add_header_library(
+  sincos_eval
+  HDRS
+    sincos_eval.h
+  DEPENDS
+    libc.src.__support.FPUtil.double_double
+    libc.src.__support.FPUtil.multiply_add
+)
+
 add_entrypoint_object(
   cosf
   SRCS
@@ -167,6 +193,29 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  sin
+  SRCS
+    sin.cpp
+  HDRS
+    ../sin.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.src.errno.errno
+    libc.src.__support.FPUtil.double_double
+    libc.src.__support.FPUtil.dyadic_float
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.fma
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.optimization
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   sinf
   SRCS