Skip to content

Commit e2ee42f

Browse files
authored
[libc][math] Fix parallel implementation for asin (#178100)
1 parent acff9fa commit e2ee42f

File tree

2 files changed

+2
-262
lines changed

2 files changed

+2
-262
lines changed

libc/src/__support/math/asin.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ namespace LIBC_NAMESPACE_DECL {
2525

2626
namespace math {
2727

28-
LIBC_INLINE static constexpr double asin(double x) {
28+
LIBC_INLINE static double asin(double x) {
2929
using namespace asin_internal;
3030
using FPBits = fputil::FPBits<double>;
3131

libc/src/math/generic/asin.cpp

Lines changed: 1 addition & 261 deletions
Original file line numberDiff line numberDiff line change
@@ -11,266 +11,6 @@
1111

1212
namespace LIBC_NAMESPACE_DECL {
1313

14-
LLVM_LIBC_FUNCTION(double, asin, (double x)) {
15-
using namespace asin_internal;
16-
using FPBits = fputil::FPBits<double>;
17-
18-
FPBits xbits(x);
19-
int x_exp = xbits.get_biased_exponent();
20-
21-
// |x| < 0.5.
22-
if (x_exp < FPBits::EXP_BIAS - 1) {
23-
// |x| < 2^-26.
24-
if (LIBC_UNLIKELY(x_exp < FPBits::EXP_BIAS - 26)) {
25-
// When |x| < 2^-26, the relative error of the approximation asin(x) ~ x
26-
// is:
27-
// |asin(x) - x| / |asin(x)| < |x^3| / (6|x|)
28-
// = x^2 / 6
29-
// < 2^-54
30-
// < epsilon(1)/2.
31-
// So the correctly rounded values of asin(x) are:
32-
// = x + sign(x)*eps(x) if rounding mode = FE_TOWARDZERO,
33-
// or (rounding mode = FE_UPWARD and x is
34-
// negative),
35-
// = x otherwise.
36-
// To simplify the rounding decision and make it more efficient, we use
37-
// fma(x, 2^-54, x) instead.
38-
// Note: to use the formula x + 2^-54*x to decide the correct rounding, we
39-
// do need fma(x, 2^-54, x) to prevent underflow caused by 2^-54*x when
40-
// |x| < 2^-1022. For targets without FMA instructions, when x is close to
41-
// denormal range, we normalize x,
42-
#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS)
43-
return x;
44-
#elif defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE)
45-
return fputil::multiply_add(x, 0x1.0p-54, x);
46-
#else
47-
if (xbits.abs().uintval() == 0)
48-
return x;
49-
// Get sign(x) * min_normal.
50-
FPBits eps_bits = FPBits::min_normal();
51-
eps_bits.set_sign(xbits.sign());
52-
double eps = eps_bits.get_val();
53-
double normalize_const = (x_exp == 0) ? eps : 0.0;
54-
double scaled_normal =
55-
fputil::multiply_add(x + normalize_const, 0x1.0p54, eps);
56-
return fputil::multiply_add(scaled_normal, 0x1.0p-54, -normalize_const);
57-
#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
58-
}
59-
60-
#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
61-
return x * asin_eval(x * x);
62-
#else
63-
unsigned idx;
64-
DoubleDouble x_sq = fputil::exact_mult(x, x);
65-
double err = xbits.abs().get_val() * 0x1.0p-51;
66-
// Polynomial approximation:
67-
// p ~ asin(x)/x
68-
69-
DoubleDouble p = asin_eval(x_sq, idx, err);
70-
// asin(x) ~ x * (ASIN_COEFFS[idx][0] + p)
71-
DoubleDouble r0 = fputil::exact_mult(x, p.hi);
72-
double r_lo = fputil::multiply_add(x, p.lo, r0.lo);
73-
74-
// Ziv's accuracy test.
75-
76-
double r_upper = r0.hi + (r_lo + err);
77-
double r_lower = r0.hi + (r_lo - err);
78-
79-
if (LIBC_LIKELY(r_upper == r_lower))
80-
return r_upper;
81-
82-
// Ziv's accuracy test failed, perform 128-bit calculation.
83-
84-
// Recalculate mod 1/64.
85-
idx = static_cast<unsigned>(fputil::nearest_integer(x_sq.hi * 0x1.0p6));
86-
87-
// Get x^2 - idx/64 exactly. When FMA is available, double-double
88-
// multiplication will be correct for all rounding modes. Otherwise we use
89-
// Float128 directly.
90-
Float128 x_f128(x);
91-
92-
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
93-
// u = x^2 - idx/64
94-
Float128 u_hi(
95-
fputil::multiply_add(static_cast<double>(idx), -0x1.0p-6, x_sq.hi));
96-
Float128 u = fputil::quick_add(u_hi, Float128(x_sq.lo));
97-
#else
98-
Float128 x_sq_f128 = fputil::quick_mul(x_f128, x_f128);
99-
Float128 u = fputil::quick_add(
100-
x_sq_f128, Float128(static_cast<double>(idx) * (-0x1.0p-6)));
101-
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
102-
103-
Float128 p_f128 = asin_eval(u, idx);
104-
Float128 r = fputil::quick_mul(x_f128, p_f128);
105-
106-
return static_cast<double>(r);
107-
#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
108-
}
109-
// |x| >= 0.5
110-
111-
double x_abs = xbits.abs().get_val();
112-
113-
// Maintaining the sign:
114-
constexpr double SIGN[2] = {1.0, -1.0};
115-
double x_sign = SIGN[xbits.is_neg()];
116-
117-
// |x| >= 1
118-
if (LIBC_UNLIKELY(x_exp >= FPBits::EXP_BIAS)) {
119-
// x = +-1, asin(x) = +- pi/2
120-
if (x_abs == 1.0) {
121-
// return +- pi/2
122-
return fputil::multiply_add(x_sign, PI_OVER_TWO.hi,
123-
x_sign * PI_OVER_TWO.lo);
124-
}
125-
// |x| > 1, return NaN.
126-
if (xbits.is_quiet_nan())
127-
return x;
128-
129-
// Set domain error for non-NaN input.
130-
if (!xbits.is_nan())
131-
fputil::set_errno_if_required(EDOM);
132-
133-
fputil::raise_except_if_required(FE_INVALID);
134-
return FPBits::quiet_nan().get_val();
135-
}
136-
137-
// When |x| >= 0.5, we perform range reduction as follow:
138-
//
139-
// Assume further that 0.5 <= x < 1, and let:
140-
// y = asin(x)
141-
// We will use the double angle formula:
142-
// cos(2y) = 1 - 2 sin^2(y)
143-
// and the complement angle identity:
144-
// x = sin(y) = cos(pi/2 - y)
145-
// = 1 - 2 sin^2 (pi/4 - y/2)
146-
// So:
147-
// sin(pi/4 - y/2) = sqrt( (1 - x)/2 )
148-
// And hence:
149-
// pi/4 - y/2 = asin( sqrt( (1 - x)/2 ) )
150-
// Equivalently:
151-
// asin(x) = y = pi/2 - 2 * asin( sqrt( (1 - x)/2 ) )
152-
// Let u = (1 - x)/2, then:
153-
// asin(x) = pi/2 - 2 * asin( sqrt(u) )
154-
// Moreover, since 0.5 <= x < 1:
155-
// 0 < u <= 1/4, and 0 < sqrt(u) <= 0.5,
156-
// And hence we can reuse the same polynomial approximation of asin(x) when
157-
// |x| <= 0.5:
158-
// asin(x) ~ pi/2 - 2 * sqrt(u) * P(u),
159-
160-
// u = (1 - |x|)/2
161-
double u = fputil::multiply_add(x_abs, -0.5, 0.5);
162-
// v_hi + v_lo ~ sqrt(u).
163-
// Let:
164-
// h = u - v_hi^2 = (sqrt(u) - v_hi) * (sqrt(u) + v_hi)
165-
// Then:
166-
// sqrt(u) = v_hi + h / (sqrt(u) + v_hi)
167-
// ~ v_hi + h / (2 * v_hi)
168-
// So we can use:
169-
// v_lo = h / (2 * v_hi).
170-
// Then,
171-
// asin(x) ~ pi/2 - 2*(v_hi + v_lo) * P(u)
172-
double v_hi = fputil::sqrt<double>(u);
173-
174-
#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
175-
double p = asin_eval(u);
176-
double r = x_sign * fputil::multiply_add(-2.0 * v_hi, p, PI_OVER_TWO.hi);
177-
return r;
178-
#else
179-
180-
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
181-
double h = fputil::multiply_add(v_hi, -v_hi, u);
182-
#else
183-
DoubleDouble v_hi_sq = fputil::exact_mult(v_hi, v_hi);
184-
double h = (u - v_hi_sq.hi) - v_hi_sq.lo;
185-
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
186-
187-
// Scale v_lo and v_hi by 2 from the formula:
188-
// vh = v_hi * 2
189-
// vl = 2*v_lo = h / v_hi.
190-
double vh = v_hi * 2.0;
191-
double vl = h / v_hi;
192-
193-
// Polynomial approximation:
194-
// p ~ asin(sqrt(u))/sqrt(u)
195-
unsigned idx;
196-
double err = vh * 0x1.0p-51;
197-
198-
DoubleDouble p = asin_eval(DoubleDouble{0.0, u}, idx, err);
199-
200-
// Perform computations in double-double arithmetic:
201-
// asin(x) = pi/2 - (v_hi + v_lo) * (ASIN_COEFFS[idx][0] + p)
202-
DoubleDouble r0 = fputil::quick_mult(DoubleDouble{vl, vh}, p);
203-
DoubleDouble r = fputil::exact_add(PI_OVER_TWO.hi, -r0.hi);
204-
205-
double r_lo = PI_OVER_TWO.lo - r0.lo + r.lo;
206-
207-
// Ziv's accuracy test.
208-
209-
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
210-
double r_upper = fputil::multiply_add(
211-
r.hi, x_sign, fputil::multiply_add(r_lo, x_sign, err));
212-
double r_lower = fputil::multiply_add(
213-
r.hi, x_sign, fputil::multiply_add(r_lo, x_sign, -err));
214-
#else
215-
r_lo *= x_sign;
216-
r.hi *= x_sign;
217-
double r_upper = r.hi + (r_lo + err);
218-
double r_lower = r.hi + (r_lo - err);
219-
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
220-
221-
if (LIBC_LIKELY(r_upper == r_lower))
222-
return r_upper;
223-
224-
// Ziv's accuracy test failed, we redo the computations in Float128.
225-
// Recalculate mod 1/64.
226-
idx = static_cast<unsigned>(fputil::nearest_integer(u * 0x1.0p6));
227-
228-
// After the first step of Newton-Raphson approximating v = sqrt(u), we have
229-
// that:
230-
// sqrt(u) = v_hi + h / (sqrt(u) + v_hi)
231-
// v_lo = h / (2 * v_hi)
232-
// With error:
233-
// sqrt(u) - (v_hi + v_lo) = h * ( 1/(sqrt(u) + v_hi) - 1/(2*v_hi) )
234-
// = -h^2 / (2*v * (sqrt(u) + v)^2).
235-
// Since:
236-
// (sqrt(u) + v_hi)^2 ~ (2sqrt(u))^2 = 4u,
237-
// we can add another correction term to (v_hi + v_lo) that is:
238-
// v_ll = -h^2 / (2*v_hi * 4u)
239-
// = -v_lo * (h / 4u)
240-
// = -vl * (h / 8u),
241-
// making the errors:
242-
// sqrt(u) - (v_hi + v_lo + v_ll) = O(h^3)
243-
// well beyond 128-bit precision needed.
244-
245-
// Get the rounding error of vl = 2 * v_lo ~ h / vh
246-
// Get full product of vh * vl
247-
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
248-
double vl_lo = fputil::multiply_add(-v_hi, vl, h) / v_hi;
249-
#else
250-
DoubleDouble vh_vl = fputil::exact_mult(v_hi, vl);
251-
double vl_lo = ((h - vh_vl.hi) - vh_vl.lo) / v_hi;
252-
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
253-
// vll = 2*v_ll = -vl * (h / (4u)).
254-
double t = h * (-0.25) / u;
255-
double vll = fputil::multiply_add(vl, t, vl_lo);
256-
// m_v = -(v_hi + v_lo + v_ll).
257-
Float128 m_v = fputil::quick_add(
258-
Float128(vh), fputil::quick_add(Float128(vl), Float128(vll)));
259-
m_v.sign = Sign::NEG;
260-
261-
// Perform computations in Float128:
262-
// asin(x) = pi/2 - (v_hi + v_lo + vll) * P(u).
263-
Float128 y_f128(fputil::multiply_add(static_cast<double>(idx), -0x1.0p-6, u));
264-
265-
Float128 p_f128 = asin_eval(y_f128, idx);
266-
Float128 r0_f128 = fputil::quick_mul(m_v, p_f128);
267-
Float128 r_f128 = fputil::quick_add(PI_OVER_TWO_F128, r0_f128);
268-
269-
if (xbits.is_neg())
270-
r_f128.sign = Sign::NEG;
271-
272-
return static_cast<double>(r_f128);
273-
#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
274-
}
14+
LLVM_LIBC_FUNCTION(double, asin, (double x)) { return math::asin(x); }
27515

27616
} // namespace LIBC_NAMESPACE_DECL

0 commit comments

Comments
 (0)