[libc] Implement sinf function that is correctly rounded to all round…

…ing modes. Implement sinf function that is correctly rounded to all rounding modes. - We use a simple range reduction for `pi/16 < |x|` : Let `k = round(x / pi)` and `y = (x/pi) - k`. So `k` is an integer and `-0.5 <= y <= 0.5`. Then ``` sin(x) = sin(y*pi + k*pi) = (-1)^(k & 1) * sin(y*pi) ~ (-1)^(k & 1) * y * P(y^2) ``` where `y*P(y^2)` is a degree-15 minimax polynomial generated by Sollya with: ``` > P = fpminimax(sin(x*pi)/x, [|0, 2, 4, 6, 8, 10, 12, 14|], [|D...|], [0, 0.5]); ``` - Performance benchmark using perf tool from CORE-MATH project (https://gitlab.inria.fr/core-math/core-math/-/tree/master) on Ryzen 1700: Before this patch (not correctly rounded): ``` $ CORE_MATH_PERF_MODE="rdtsc" ./perf.sh sinf CORE-MATH reciprocal throughput : 17.892 System LIBC reciprocal throughput : 25.559 LIBC reciprocal throughput : 29.381 ``` After this patch (correctly rounded): ``` $ CORE_MATH_PERF_MODE="rdtsc" ./perf.sh sinf CORE-MATH reciprocal throughput : 17.896 System LIBC reciprocal throughput : 25.740 LIBC reciprocal throughput : 27.872 LIBC reciprocal throughput : 20.012 (with `-msse4.2` flag) LIBC reciprocal throughput : 14.244 (with `-mfma` flag) ``` Reviewed By: zimmermann6 Differential Revision: https://reviews.llvm.org/D123154
llvm · Jul 22, 2022 · d883a4a · d883a4a
1 parent 4f2cfbe
commit d883a4a
Show file tree

Hide file tree

Showing 12 changed files with 669 additions and 75 deletions.
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -515,7 +515,7 @@ function(add_entrypoint_object target_name)
   list(SORT flags)
 
   if(SHOW_INTERMEDIATE_OBJECTS AND flags)
-    message(STATUS "Object library ${fq_target_name} has FLAGS: ${flags}")
+    message(STATUS "Entrypoint object ${fq_target_name} has FLAGS: ${flags}")
   endif()
 
   if(NOT ADD_TO_EXPAND_NAME)

diff --git a/libc/docs/math.rst b/libc/docs/math.rst
@@ -164,7 +164,7 @@ log            |check|
 log10          |check|
 log1p          |check|
 log2           |check|
-sin            0.561 ULPs       large
+sin            |check|          large
 sincos         0.776 ULPs       large
 sqrt           |check|          |check|         |check|
 ============== ================ =============== ======================
@@ -205,13 +205,13 @@ Performance
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | expm1f       |        14 |                53 |        59 |               146 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
-| fmodf    (n) |        73 |               263 |        -  |                 - | [MIN_NORMAL, MAX_NORMAL]            | i5 mobile  | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 |               |
+| fmodf        |        73 |               263 |        -  |                 - | [MIN_NORMAL, MAX_NORMAL]            | i5 mobile  | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 |               |
+|              +-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
+|              |         9 |                11 |        -  |                 - | [0, MAX_SUBNORMAL]                  | i5 mobile  | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 |               |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
-| fmodf (d)    |         9 |                11 |        -  |                 - | [0, MAX_SUBNORMAL]                  | i5 mobile  | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 |               |
-+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
-| fmod (n)     |       595 |              3297 |        -  |                 - | [MIN_NORMAL, MAX_NORMAL]            | i5 mobile  | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 |               |
-+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
-| fmod (d)     |        14 |                13 |        -  |                 - | [0, MAX_SUBNORMAL]                  | i5 mobile  | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 |               |
+| fmod         |       595 |              3297 |        -  |                 - | [MIN_NORMAL, MAX_NORMAL]            | i5 mobile  | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 |               |
+|              +-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
+|              |        14 |                13 |        -  |                 - | [0, MAX_SUBNORMAL]                  | i5 mobile  | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 |               |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | hypotf       |        25 |                15 |        64 |                49 | :math:`[-10, 10] \times [-10, 10]`  | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 |               |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
@@ -223,7 +223,7 @@ Performance
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | log2f        |        13 |                10 |        57 |                46 | :math:`[e^{-1}, e]`                 | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
-| sinf         |        36 |                31 |        72 |                71 | :math:`[-\pi, \pi]`                 | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 |               |
+| sinf         |        14 |                26 |        65 |                59 | :math:`[-\pi, \pi]`                 | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 
 References

diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -13,6 +13,7 @@ add_header_library(
     NormalFloat.h
     PlatformDefs.h
     builtin_wrappers.h
+    except_value_utils.h
   DEPENDS
     libc.include.math
     libc.include.errno

diff --git a/libc/src/__support/FPUtil/except_value_utils.h b/libc/src/__support/FPUtil/except_value_utils.h
@@ -0,0 +1,70 @@
+//===-- Common header for helpers to set exceptional values -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_EXCEPT_VALUE_UTILS_H
+#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_EXCEPT_VALUE_UTILS_H
+
+#include "FEnvImpl.h"
+#include "FPBits.h"
+
+namespace __llvm_libc {
+
+namespace fputil {
+
+template <typename T, int N> struct ExceptionalValues {
+  using UIntType = typename FPBits<T>::UIntType;
+  static constexpr int SIZE = N;
+  // Input bits.
+  UIntType inputs[SIZE];
+  // Output bits contains 4 values:
+  //   output[i][0]: output bits corresponding to FE_TOWARDZERO
+  //   output[i][1]: offset for FE_UPWARD
+  //   output[i][2]: offset for FE_DOWNWARD
+  //   output[i][3]: offset for FE_TONEAREST
+  UIntType outputs[SIZE][4];
+};
+
+template <typename T, int N> struct ExceptionChecker {
+  using UIntType = typename FPBits<T>::UIntType;
+  using FPBits = FPBits<T>;
+  using ExceptionalValues = ExceptionalValues<T, N>;
+
+  static bool check_odd_func(const ExceptionalValues &ExceptVals,
+                             UIntType x_abs, bool sign, T &result) {
+    for (int i = 0; i < N; ++i) {
+      if (unlikely(x_abs == ExceptVals.inputs[i])) {
+        UIntType out_bits = ExceptVals.outputs[i][0]; // FE_TOWARDZERO
+        switch (fputil::get_round()) {
+        case FE_UPWARD:
+          out_bits +=
+              sign ? ExceptVals.outputs[i][2] : ExceptVals.outputs[i][1];
+          break;
+        case FE_DOWNWARD:
+          out_bits +=
+              sign ? ExceptVals.outputs[i][1] : ExceptVals.outputs[i][2];
+          break;
+        case FE_TONEAREST:
+          out_bits += ExceptVals.outputs[i][3];
+          break;
+        }
+        result = FPBits(out_bits).get_val();
+        if (sign)
+          result = -result;
+
+        return true;
+      }
+    }
+    return false;
+  }
+};
+
+} // namespace fputil
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_EXCEPT_VALUE_UTILS_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
@@ -76,11 +76,16 @@ add_entrypoint_object(
     sinf.cpp
   HDRS
     ../sinf.h
+    range_reduction.h
+    range_reduction_fma.h
   DEPENDS
-    .sincosf_utils
     libc.include.math
     libc.src.errno.errno
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.fma
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
   COMPILE_OPTIONS
     -O3
 )

diff --git a/libc/src/math/generic/range_reduction.h b/libc/src/math/generic/range_reduction.h
@@ -0,0 +1,131 @@
+//===-- Utilities for trigonometric functions -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_H
+#define LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+
+namespace __llvm_libc {
+
+namespace generic {
+
+static constexpr uint32_t FAST_PASS_BOUND = 0x4c80'0000U; // 2^26
+
+static constexpr int N_ENTRIES = 8;
+
+// We choose to split bits of 1/pi into 28-bit precision pieces, so that the
+// product of x * ONE_OVER_PI_28[i] is exact.
+// These are generated by Sollya with:
+// > a1 = D(round(1/pi, 28, RN)); a1;
+// > a2 = D(round(1/pi - a1, 28, RN)); a2;
+// > a3 = D(round(1/pi - a1 - a2, 28, RN)); a3;
+// > a4 = D(round(1/pi - a1 - a2 - a3, 28, RN)); a4;
+// ...
+static constexpr double ONE_OVER_PI_28[N_ENTRIES] = {
+    0x1.45f306ep-2,   -0x1.b1bbeaep-33,  0x1.3f84ebp-62,    -0x1.7056592p-92,
+    0x1.c0db62ap-121, -0x1.4cd8778p-150, -0x1.bef806cp-179, 0x1.63abdecp-209};
+
+// Exponents of the least significant bits of the corresponding entries in
+// ONE_OVER_PI_28.
+static constexpr int ONE_OVER_PI_28_LSB_EXP[N_ENTRIES] = {
+    -29, -60, -86, -119, -148, -175, -205, -235};
+
+// Return (k mod 2) and y, where
+//   k = round(x / pi) and y = (x / pi) - k.
+static inline int64_t small_range_reduction(double x, double &y) {
+  double prod = x * ONE_OVER_PI_28[0];
+  double kd = fputil::nearest_integer(prod);
+  y = prod - kd;
+  y = fputil::multiply_add(x, ONE_OVER_PI_28[1], y);
+  y = fputil::multiply_add(x, ONE_OVER_PI_28[2], y);
+  return static_cast<int64_t>(kd);
+}
+
+// Return k and y, where
+//   k = round(x / pi) and y = (x / pi) - k.
+// For large range, there are at most 2 parts of ONE_OVER_PI_28 contributing to
+// the unit binary digit (k & 1).  If the least significant bit of x * the least
+// significant bit of ONE_OVER_PI_28[i] > 1, we can completely ignore
+// ONE_OVER_PI_28[i].
+static inline int64_t large_range_reduction(double x, int x_exp, double &y) {
+  int idx = 0;
+  y = 0;
+  int x_lsb_exp = x_exp - fputil::FloatProperties<float>::MANTISSA_WIDTH;
+
+  // Skipping the first parts of 1/pi such that:
+  //   LSB of x * LSB of ONE_OVER_PI_28[i] > 1.
+  while (x_lsb_exp + ONE_OVER_PI_28_LSB_EXP[idx] > 0)
+    ++idx;
+
+  double prod_hi = x * ONE_OVER_PI_28[idx];
+  // Get the integral part of x * ONE_OVER_PI_28[idx]
+  double k_hi = fputil::nearest_integer(prod_hi);
+  // Get the fractional part of x * ONE_OVER_PI_28[idx]
+  double frac = prod_hi - k_hi;
+  double prod_lo = fputil::multiply_add(x, ONE_OVER_PI_28[idx + 1], frac);
+  double k_lo = fputil::nearest_integer(prod_lo);
+
+  // Now y is the fractional parts.
+  y = prod_lo - k_lo;
+  y = fputil::multiply_add(x, ONE_OVER_PI_28[idx + 2], y);
+  y = fputil::multiply_add(x, ONE_OVER_PI_28[idx + 3], y);
+
+  return static_cast<int64_t>(k_hi + k_lo);
+}
+
+// Exceptional cases.
+static constexpr int N_EXCEPT_SMALL = 4;
+
+static constexpr fputil::ExceptionalValues<float, N_EXCEPT_SMALL> SmallExcepts{
+    /* inputs */ {
+        0x3fa7832a, // x = 0x1.4f0654p0
+        0x46199998, // x = 0x1.33333p13
+        0x4afdece4, // x = 0x1.fbd9c8p22
+        0x4c2332e9, // x = 0x1.4665d2p25
+    },
+    /* outputs (RZ, RU offset, RD offset, RN offset) */
+    {
+        {0x3f7741b5, 1, 0, 1}, // x = 0x1.4f0654p0, sin(x) = 0x1.ee836ap-1 (RZ)
+        {0xbeb1fa5d, 0, 1, 0}, // x = 0x1.33333p13, sin(x) = -0x1.63f4bap-2 (RZ)
+        {0xbf7fb6e0, 0, 1, 1}, // x = 0x1.fbd9c8p22, sin(x) = -0x1.ff6dcp-1 (RZ)
+        {0xbf7fffff, 0, 1,
+         1}, // x = 0x1.4665d2p25, sin(x) = -0x1.fffffep-1 (RZ)
+    }};
+
+static constexpr int N_EXCEPT_LARGE = 5;
+
+static constexpr fputil::ExceptionalValues<float, N_EXCEPT_LARGE> LargeExcepts{
+    /* inputs */ {
+        0x523947f6, // x = 0x1.728fecp37
+        0x53b146a6, // x = 0x1.628d4cp40
+        0x55cafb2a, // x = 0x1.95f654p44
+        0x6a1976f1, // x = 0x1.32ede2p85
+        0x77584625, // x = 0x1.b08c4ap111
+    },
+    /* outputs (RZ, RU offset, RD offset, RN offset) */
+    {
+        {0xbf12791d, 0, 1,
+         1}, // x = 0x1.728fecp37, sin(x) = -0x1.24f23ap-1 (RZ)
+        {0xbf7fffff, 0, 1,
+         1}, // x = 0x1.628d4cp40, sin(x) = -0x1.fffffep-1 (RZ)
+        {0xbf7e7a16, 0, 1,
+         1}, // x = 0x1.95f654p44, sin(x) = -0x1.fcf42cp-1 (RZ)
+        {0x3f7fffff, 1, 0, 1}, // x = 0x1.32ede2p85, sin(x) = 0x1.fffffep-1 (RZ)
+        {0xbf7fffff, 0, 1,
+         1}, // x = 0x1.b08c4ap111, sin(x) = -0x1.fffffep-1 (RZ)
+    }};
+
+} // namespace generic
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_H