Expand Up
@@ -15,174 +15,63 @@
#include " src/__support/FPUtil/nearest_integer.h"
#include " src/__support/common.h"
#include " src/__support/macros/config.h"
#include " src/__support/macros/optimization.h"
#include " src/math/generic/range_reduction_double_common.h"
namespace LIBC_NAMESPACE_DECL {
namespace nofma {
using fputil::DoubleDouble;
// Exponent threshold associated with the fast range-reduction pass.
// NOTE(review): presumably inputs whose exponent is below this value take the
// fast path; its use is not visible in this chunk -- confirm against callers.
LIBC_INLINE constexpr int FAST_PASS_EXPONENT = 23 ;
LIBC_INLINE unsigned LargeRangeReduction::fast (double x, DoubleDouble &u) {
using FPBits = typename fputil::FPBits<double >;
FPBits xbits (x);
// Digits of 2^(16*i) / pi, generated by Sollya with:
// For [3..63]:
// > for i from 3 to 63 do {
// pi_inv = 2^(16*(i - 3)) / pi;
// pn = nearestint(pi_inv);
// pi_frac = pi_inv - pn;
// a = round(pi_frac, 51, RN);
// b = round(pi_frac - a, 51, RN);
// c = round(pi_frac - a - b, D, RN);
// d = round(pi_frac - a - b - c, D, RN);
// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},");
// };
// For [0..2]:
// The leading bit of 2^(16*(i - 3)) / pi is very small, so we add 0.25 so that
// the conditions for the algorithms are still satisfied, and one of those
// conditions guarantees that ulp(0.25 * x_reduced) >= 2, and will safely be
// discarded.
// > for i from 0 to 2 do {
// pi_frac = 0.25 + 2^(16*(i - 3)) / pi;
// a = round(pi_frac, 51, RN);
// b = round(pi_frac - a, 51, RN);
// c = round(pi_frac - a - b, D, RN);
// d = round(pi_frac - a - b - c, D, RN);
// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},");
// };
// For the fast pass using double-double, we only need 3 parts (a, b, c), but
// for the accurate pass using Float128, instead of using another table of
// Float128s, we simply add the fourth part (a, b, c, d), which simplifies the
// implementation a bit and saves some memory.
LIBC_INLINE constexpr double ONE_TWENTY_EIGHT_OVER_PI[64 ][4 ] = {
{0x1 .0000000000014p5, 0x1 .7cc1b727220a8p-49 , 0x1 .4fe13abe8fa9ap-101 ,
0x1 .bb81b6c52b328p -155 },
{0x1 .0000000145f3p5, 0x1 .b727220a94fep -49 , 0x1 .3abe8fa9a6eep-101 ,
0x1 .b6c52b3278872p -155 },
{0x1 .000145f306dc8p5, 0x1 .c882a53f84ebp -47 , -0x1 .70565911f924fp-101 ,
0x1 .2b3278872084p-155 },
{0x1 .45f306dc9c884p5, -0x1 .5ac07b1505c14p-47 , -0x1 .96447e493ad4dp-99 ,
0x1 .3c439041fe516p-154 },
{-0x1 .f246c6efab58p4 , -0x1 .ec5417056591p -49 , -0x1 .f924eb53361dep -101 ,
-0x1 .bef806ba71508p -156 },
{0x1 .391054a7f09d4p4, 0x1 .f47d4d377036cp -48 , 0x1 .8a5664f10e41p-100 ,
0x1 .fe5163abdebbcp -154 },
{0x1 .529fc2757d1f4p2, 0x1 .34ddc0db62958p-50 , 0x1 .93c439041fe51p-102 ,
0x1 .8eaf7aef1586ep-156 },
{-0x1 .ec5417056591p -1 , -0x1 .f924eb53361ep -53 , 0x1 .c820ff28b1d5fp -105 ,
-0x1 .443a9e48db91cp-162 },
{-0x1 .505c1596447e4p5, -0x1 .275a99b0ef1cp-48 , 0x1 .07f9458eaf7afp-100 ,
-0x1 .d4f246dc8e2dfp -157 },
{-0x1 .596447e493ad4p1, -0x1 .9b0ef1bef806cp-52 , 0x1 .63abdebbc561bp-106 ,
0x1 .c91b8e909374cp -160 },
{0x1 .bb81b6c52b328p5 , -0x1 .de37df00d74e4p -49 , 0x1 .5ef5de2b0db92p-101 ,
0x1 .b8e909374b802p -156 },
{0x1 .b6c52b3278874p5 , -0x1 .f7c035d38a844p -47 , 0x1 .778ac36e48dc7p-99 ,
0x1 .2126e97003249p-153 },
{0x1 .2b3278872084p5, -0x1 .ae9c5421443a8p -50 , -0x1 .e48db91c5bdb2p -102 ,
-0x1 .68ffcdb688afbp-157 },
{-0x1 .8778df7c035d4p5, 0x1 .d5ef5de2b0db8p -49 , 0x1 .2371d2126e97p-101 ,
0x1 .924bba8274648p-160 },
{-0x1 .bef806ba71508p4 , -0x1 .443a9e48db91cp-50 , -0x1 .6f6c8b47fe6dbp-104 ,
-0x1 .115f62e6de302p-158 },
{-0x1 .ae9c5421443a8p -2 , -0x1 .e48db91c5bdb4p -54 , 0x1 .d2e006492eea1p -106 ,
-0x1 .8b9b78c078854p-160 },
{-0x1 .38a84288753c8p5, -0x1 .1b7238b7b645cp-47 , 0x1 .c00c925dd413ap -99 ,
0x1 .921cfe1deb1cbp-154 },
{-0x1 .0a21d4f246dc8p3, -0x1 .c5bdb22d1ff9cp -50 , 0x1 .25dd413a3243ap-103 ,
-0x1 .e214e34ed658cp -162 },
{-0x1 .d4f246dc8e2ep3 , 0x1 .26e9700324978p-49 , -0x1 .5f62e6de301e2p-102 ,
-0x1 .4e34ed658c117p-158 },
{-0x1 .236e4716f6c8cp4, 0x1 .700324977505p-49 , -0x1 .736f180f10a72p-101 ,
0x1 .62534e7dd1047p-155 },
{0x1 .b8e909374b8p4 , 0x1 .924bba8274648p-48 , 0x1 .cfe1deb1cb12ap -102 ,
-0x1 .63045df7282b4p-156 },
{0x1 .09374b801924cp4, -0x1 .15f62e6de302p-50 , 0x1 .deb1cb129a73fp -102 ,
-0x1 .77dca0ad144bbp-158 },
{-0x1 .68ffcdb688afcp3, 0x1 .d1921cfe1debp -50 , 0x1 .cb129a73ee882p -102 ,
0x1 .afa975da24275p -157 },
{0x1 .924bba8274648p0, 0x1 .cfe1deb1cb128p -54 , 0x1 .a73ee88235f53p -106 ,
-0x1 .44bb7b16638fep-162 },
{-0x1 .a22bec5cdbc6p5 , -0x1 .e214e34ed658cp -50 , -0x1 .177dca0ad144cp-106 ,
0x1 .213a671c09ad1p-160 },
{0x1 .3a32439fc3bd8p1, -0x1 .c69dacb1822fp -51 , 0x1 .1afa975da2427p-105 ,
0x1 .338e04d68befdp-159 },
{-0x1 .b78c0788538d4p4 , 0x1 .29a73ee88236p-50 , -0x1 .5a28976f62cc7p-103 ,
-0x1 .fb29741037d8dp -159 },
{0x1 .fc3bd63962534p5 , 0x1 .cfba208d7d4bcp -48 , -0x1 .12edec598e3f6p-100 ,
-0x1 .4ba081bec66e3p-154 },
{-0x1 .4e34ed658c118p2, 0x1 .046bea5d7689p-51 , 0x1 .3a671c09ad17ep-104 ,
-0x1 .bec66e29c67cbp -162 },
{0x1 .62534e7dd1048p5, -0x1 .415a28976f62cp-47 , -0x1 .8e3f652e8207p-100 ,
0x1 .3991d63983534p-154 },
{-0x1 .63045df7282b4p4, -0x1 .44bb7b16638fcp-50 , -0x1 .94ba081bec66ep-102 ,
-0x1 .4e33e566305b2p-157 },
{0x1 .d1046bea5d768p5 , 0x1 .213a671c09adp-48 , 0x1 .7df904e64758ep-100 ,
0x1 .835339f49c846p-154 },
{0x1 .afa975da24274p3 , 0x1 .9c7026b45f7e4p-50 , 0x1 .3991d63983534p-106 ,
-0x1 .82d8dee81d108p-160 },
{-0x1 .a28976f62cc7p5 , -0x1 .fb29741037d8cp -47 , -0x1 .b8a719f2b3183p -100 ,
0x1 .3908bf177bf25p-155 },
{-0x1 .76f62cc71fb28p5, -0x1 .741037d8cdc54p-47 , 0x1 .cc1a99cfa4e42p -101 ,
0x1 .7e2ef7e4a0ec8p-156 },
{0x1 .d338e04d68bfp5 , -0x1 .bec66e29c67ccp -50 , 0x1 .339f49c845f8cp-102 ,
-0x1 .081b5f13801dap-156 },
{0x1 .c09ad17df905p4 , -0x1 .9b8a719f2b318p-48 , -0x1 .6c6f740e8840ep-103 ,
0x1 .41d8ffc4bffefp-157 },
{0x1 .68befc827323cp5, -0x1 .38cf9598c16c8p-47 , 0x1 .08bf177bf2507p-99 ,
0x1 .8ffc4bffef02dp-153 },
{-0x1 .037d8cdc538dp5, 0x1 .a99cfa4e422fcp -49 , 0x1 .77bf250763ff1p-103 ,
0x1 .7ffde05980fefp-158 },
{-0x1 .8cdc538cf9598p5, -0x1 .82d8dee81d108p-48 , -0x1 .b5f13801da001p -104 ,
0x1 .e05980fef2f12p -158 },
{-0x1 .4e33e566305bp3, -0x1 .bdd03a21036cp -49 , 0x1 .d8ffc4bffef03p -101 ,
-0x1 .9fc04343b9d29p-156 },
{-0x1 .f2b3182d8dee8p4 , -0x1 .d1081b5f138p -52 , -0x1 .da00087e99fcp -104 ,
-0x1 .0d0ee74a5f593p-158 },
{-0x1 .8c16c6f740e88p5, -0x1 .036be27003b4p-49 , -0x1 .0fd33f8086877p-109 ,
-0x1 .d297d64b824b2p -164 },
{0x1 .3908bf177bf24p5, 0x1 .0763ff12fffbcp-47 , 0x1 .6603fbcbc462dp-104 ,
0x1 .a0a6d1f6d367fp -158 },
{0x1 .7e2ef7e4a0ec8p4, -0x1 .da00087e99fcp -56 , -0x1 .0d0ee74a5f593p-110 ,
0x1 .f6d367ecf27cbp -166 },
{-0x1 .081b5f13801dcp4, 0x1 .fff7816603fbcp -48 , 0x1 .788c5ad05369p-101 ,
-0x1 .25930261b069fp-155 },
{-0x1 .af89c00ed0004p5 , -0x1 .fa67f010d0ee8p -50 , 0x1 .6b414da3eda6dp-103 ,
-0x1 .30d834f648b0cp-162 },
{-0x1 .c00ed00043f4cp5 , -0x1 .fc04343b9d298p -48 , 0x1 .4da3eda6cfd9ep-103 ,
0x1 .3e584dba7a32p-157 },
{0x1 .2fffbc0b301fcp5, 0x1 .e5e2316b414dcp -47 , -0x1 .c125930261b07p -99 ,
0x1 .84dba7a31fb35p-153 },
{-0x1 .0fd33f8086878p3, 0x1 .8b5a0a6d1f6d4p-50 , -0x1 .30261b069ec91p-103 ,
-0x1 .85ce04cb0d00bp-157 },
{-0x1 .9fc04343b9d28p4, -0x1 .7d64b824b2604p-48 , -0x1 .86c1a7b24585dp-101 ,
0x1 .fb34f2ff516bbp -157 },
{-0x1 .0d0ee74a5f594p2, 0x1 .1f6d367ecf27cp-50 , 0x1 .6136e9e8c7ecdp-103 ,
0x1 .e5fea2d7527bbp -158 },
{-0x1 .dce94beb25c14p5 , 0x1 .a6cfd9e4f9614p -47 , -0x1 .22c2e70265868p-100 ,
-0x1 .5d28ad8453814p-158 },
{-0x1 .4beb25c12593p5, -0x1 .30d834f648b0cp-50 , 0x1 .8fd9a797fa8b6p-104 ,
-0x1 .5b08a7028341dp-159 },
{0x1 .b47db4d9fb3c8p4 , 0x1 .f2c26dd3d18fcp -48 , 0x1 .9a797fa8b5d4ap-100 ,
-0x1 .14e050683a131p-156 },
{-0x1 .25930261b06ap5, 0x1 .36e9e8c7ecd3cp-47 , 0x1 .7fa8b5d49eeb2p-100 ,
-0x1 .41a0e84c2f8c6p-158 },
{0x1 .fb3c9f2c26dd4p4 , -0x1 .738132c3402bcp-51 , 0x1 .aea4f758fd7ccp -103 ,
-0x1 .d0985f18c10ebp -159 },
{-0x1 .b069ec9161738p5 , -0x1 .32c3402ba515cp-51 , 0x1 .eeb1faf97c5edp -104 ,
-0x1 .7c63043ad6b69p-161 },
{-0x1 .ec9161738132cp5 , -0x1 .a015d28ad8454p -50 , 0x1 .faf97c5ecf41dp -104 ,
-0x1 .821d6b5b4565p-160 },
{-0x1 .61738132c3404p5, 0x1 .45aea4f758fd8p-47 , -0x1 .a0e84c2f8c608p -102 ,
-0x1 .d6b5b45650128p -156 },
{0x1 .fb34f2ff516bcp3 , -0x1 .6c229c0a0d074p-49 , -0x1 .30be31821d6b6p-104 ,
0x1 .2ea6bfb5fb12p-158 },
{0x1 .3cbfd45aea4f8p5, -0x1 .4e050683a130cp-48 , 0x1 .ce7de294a4baap -104 ,
-0x1 .404a04ee072a3p-158 },
{-0x1 .5d28ad8453814p2, -0x1 .a0e84c2f8c608p -54 , -0x1 .d6b5b45650128p -108 ,
-0x1 .3b81ca8bdea7fp-164 },
{-0x1 .15b08a702834p5, -0x1 .d0985f18c10ecp -47 , 0x1 .4a4ba9afed7ecp-100 ,
0x1 .1f8d5d0856033p-154 },
};
int x_e_m62 = xbits.get_biased_exponent () - (FPBits::EXP_BIAS + 62 );
idx = static_cast <unsigned >((x_e_m62 >> 4 ) + 3 );
// Scale x down by 2^(-(16 * (idx - 3)))
xbits.set_biased_exponent ((x_e_m62 & 15 ) + FPBits::EXP_BIAS + 62 );
// 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78
x_reduced = xbits.get_val ();
// x * c_hi = ph.hi + ph.lo exactly.
DoubleDouble x_split = fputil::split (x_reduced);
DoubleDouble ph = fputil::exact_mult<SPLIT>(x_split, x_reduced,
ONE_TWENTY_EIGHT_OVER_PI[idx][0 ]);
// x * c_mid = pm.hi + pm.lo exactly.
DoubleDouble pm = fputil::exact_mult<SPLIT>(x_split, x_reduced,
ONE_TWENTY_EIGHT_OVER_PI[idx][1 ]);
// x * c_lo = pl.hi + pl.lo exactly.
DoubleDouble pl = fputil::exact_mult<SPLIT>(x_split, x_reduced,
ONE_TWENTY_EIGHT_OVER_PI[idx][2 ]);
// Extract integral parts and fractional parts of (ph.lo + pm.hi).
double sum_hi = ph.lo + pm.hi ;
double kd = fputil::nearest_integer (sum_hi);
// x * 128/pi mod 1 ~ y_hi + y_mid + y_lo
y_hi = (ph.lo - kd) + pm.hi ; // Exact
y_mid = fputil::exact_add (pm.lo , pl.hi );
y_lo = pl.lo ;
// y_l = x * c_lo_2 + pl.lo
double y_l =
fputil::multiply_add (x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][3 ], y_lo);
DoubleDouble y = fputil::exact_add (y_hi, y_mid.hi );
y.lo += (y_mid.lo + y_l);
// Digits of pi/128, generated by Sollya with:
// > a = round(pi/128, D, RN);
// > b = round(pi/128 - a, D, RN);
constexpr DoubleDouble PI_OVER_128_DD = {0x1 .1a62633145c07p-60 ,
0x1 .921fb54442d18p-6 };
// Error bound: with {a} denote the fractional part of a, i.e.:
// {a} = a - round(a)
// Then,
// | {x * 128/pi} - (y_hi + y_lo) | <= ulp(ulp(y_hi)) <= 2^-105
// | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-105 = 2^-110
u = fputil::quick_mult<SPLIT>(y, PI_OVER_128_DD);
return static_cast <unsigned >(static_cast <int64_t >(kd));
}
// Lookup table for sin(k * pi / 128) with k = 0, ..., 255.
// Table is generated with Sollya as follows:
Expand Down
Expand Up
@@ -258,6 +147,7 @@ LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = {
{0x1 .e3a843d1db55fp -53 , 0x1 .ff621e3796d7cp -1 },
{0x1 .765595d548d9ap-54 , 0x1 .ffd886084cd0cp -1 },
{0 , 1 },
#ifndef LIBC_MATH_HAS_SMALL_TABLES
{0x1 .765595d548d9ap-54 , 0x1 .ffd886084cd0cp -1 },
{0x1 .e3a843d1db55fp -53 , 0x1 .ff621e3796d7cp -1 },
{-0x1 .eade132f3981dp -53 , 0x1 .fe9cdad01883cp -1 },
Expand Down
Expand Up
@@ -449,46 +339,9 @@ LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = {
{-0x1 .ccbeeeae8129ap -56 , -0x1 .2d52092ce19f4p-4 },
{0x1 .912bd0d569a9p-61 , -0x1 .91f65f10dd814p-5 },
{-0x1 .f938a73db97fbp -58 , -0x1 .92155f7a3667cp-6 },
#endif // !LIBC_MATH_HAS_SMALL_TABLES
};
// Range reduction of x modulo pi/128 for the small-input pass.
//
// Computes kd = nearest_integer(x * 128/pi), then evaluates x + kd * (-pi/128)
// with -pi/128 split into four pieces (MPI_OVER_128), and stores the result as
// a double-double in `u`.
//
// Parameters:
//   x - the input value to reduce.
//   u - output; receives exact_add(y_hi, y_lo), approximating x - kd * pi/128.
// Returns:
//   kd converted to int and then to unsigned.
//
// NOTE(review): the input range is not checked here; the "Exact" annotations
// below rely on |x| (and hence kd) being small enough -- confirm against the
// caller.
LIBC_INLINE unsigned range_reduction_small (double x, DoubleDouble &u) {
// 128/pi rounded to double precision.
constexpr double ONE_TWENTY_EIGHT_OVER_PI = 0x1 .45f306dc9c883p5;
// Digits of -pi/128, generated by Sollya with:
// > a = round(-pi/128, 25, RN);
// > b = round(-pi/128 - a, 23, RN);
// > c = round(-pi/128 - a - b, 25, RN);
// > d = round(-pi/128 - a - b - c, D, RN);
// -pi/128 ~ a + b + c + d
// The precisions of the parts are chosen so that:
// 1) k * a, k * b, k * c are exact in double precision
// 2) k * b + (x - (k * a)) is exact in double precision
constexpr double MPI_OVER_128[4 ] = {-0x1 .921fb5p-6 , -0x1 .110b48p-32 ,
+0x1 .ee59dap -56 , -0x1 .98a2e03707345p-83 };
// kd is the integer multiple of pi/128 nearest to x.
double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI;
double kd = fputil::nearest_integer (prod_hi);
// With -pi/128 ~ a + b + c + d as in MPI_OVER_128 description:
// t = x + k * a
double t = fputil::multiply_add (kd, MPI_OVER_128[0 ], x); // Exact
// y_hi = t + k * b = (x + k * a) + k * b
double y_hi = fputil::multiply_add (kd, MPI_OVER_128[1 ], t); // Exact
// y_lo ~ k * c + k * d
double y_lo = fputil::multiply_add (kd, MPI_OVER_128[2 ], kd * MPI_OVER_128[3 ]);
// u.hi + u.lo ~ x + k * (a + b + c + d)
u = fputil::exact_add (y_hi, y_lo);
// Error bound: For |x| < 2^-23,
// |(x mod pi/128) - (u_hi + u_lo)| < ulp(y_lo)
// <= ulp(2 * x * c)
// <= ulp(2^24 * 2^-56)
// = 2^(24 - 56 - 52)
// = 2^-84
// NOTE(review): the "2^-23" above looks like it was meant to be a positive
// exponent (cf. FAST_PASS_EXPONENT = 23 in this file) -- confirm before
// relying on this bound.
return static_cast <unsigned >(static_cast <int >(kd));
}
} // namespace nofma
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_NOFMA_H