diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt index a2a30c2941d6b..3e3059570c184 100644 --- a/libclc/clc/lib/amdgpu/CMakeLists.txt +++ b/libclc/clc/lib/amdgpu/CMakeLists.txt @@ -5,6 +5,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES math/clc_exp2.cl math/clc_exp2_fast.cl math/clc_exp10.cl + math/clc_fmod.cl math/clc_frexp.cl math/clc_half_exp.cl math/clc_half_exp2.cl diff --git a/libclc/clc/lib/amdgpu/math/clc_fmod.cl b/libclc/clc/lib/amdgpu/math/clc_fmod.cl new file mode 100644 index 0000000000000..a05b75d968ce7 --- /dev/null +++ b/libclc/clc/lib/amdgpu/math/clc_fmod.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clc/math/clc_fmod.h" + +#define __CLC_FUNCTION __clc_fmod +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_fmod +#define __CLC_BODY "clc/shared/binary_def.inc" + +#include "clc/math/gentype.inc" diff --git a/libclc/clc/lib/generic/math/clc_fmod.cl b/libclc/clc/lib/generic/math/clc_fmod.cl index 8add0cefd621f..7f60b403b53e6 100644 --- a/libclc/clc/lib/generic/math/clc_fmod.cl +++ b/libclc/clc/lib/generic/math/clc_fmod.cl @@ -6,10 +6,187 @@ // //===----------------------------------------------------------------------===// -#include "clc/internal/clc.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +_CLC_DEF _CLC_OVERLOAD float __clc_fmod(float x, float y) { + int ux = __clc_as_int(x); + int ax = ux & EXSIGNBIT_SP32; + float xa = __clc_as_float(ax); + int sx = ux ^ ax; + int ex = ax >> EXPSHIFTBITS_SP32; + + int uy = __clc_as_int(y); + int ay = uy & EXSIGNBIT_SP32; + float ya = __clc_as_float(ay); + int ey = ay >> EXPSHIFTBITS_SP32; + + float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff)); + float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff)); + int c; + int k = ex - ey; + + while (k > 0) { + c = xr >= yr; + xr -= c ? yr : 0.0f; + xr += xr; + --k; + } + + c = xr >= yr; + xr -= c ? yr : 0.0f; + + int lt = ex < ey; + + xr = lt ? xa : xr; + yr = lt ? ya : yr; + + float s = __clc_as_float(ey << EXPSHIFTBITS_SP32); + xr *= lt ? 1.0f : s; + + c = ax == ay; + xr = c ? 0.0f : xr; + + xr = __clc_as_float(sx ^ __clc_as_int(xr)); + + c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | + ay == 0; + xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr; + + return xr; +} + +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_fmod +#define __CLC_BODY +#include +#undef __CLC_FUNCTION + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { + ulong ux = __clc_as_ulong(x); + ulong ax = ux & ~SIGNBIT_DP64; + ulong xsgn = ux ^ ax; + double dx = __clc_as_double(ax); + int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64); + int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64); + xexp1 = xexp < 1 ? xexp1 : xexp; + + ulong uy = __clc_as_ulong(y); + ulong ay = uy & ~SIGNBIT_DP64; + double dy = __clc_as_double(ay); + int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64); + int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64); + yexp1 = yexp < 1 ? yexp1 : yexp; + + // First assume |x| > |y| + + // Set ntimes to the number of times we need to do a + // partial remainder. If the exponent of x is an exact multiple + // of 53 larger than the exponent of y, and the mantissa of x is + // less than the mantissa of y, ntimes will be one too large + // but it doesn't matter - it just means that we'll go round + // the loop below one extra time. + int ntimes = __clc_max(0, (xexp1 - yexp1) / 53); + double w = __clc_ldexp(dy, ntimes * 53); + w = ntimes == 0 ? dy : w; + double scale = ntimes == 0 ? 1.0 : 0x1.0p-53; + + // Each time round the loop we compute a partial remainder. + // This is done by subtracting a large multiple of w + // from x each time, where w is a scaled up version of y. + // The subtraction must be performed exactly in quad + // precision, though the result at each stage can + // fit exactly in a double precision number. + int i; + double t, v, p, pp; + + for (i = 0; i < ntimes; i++) { + // Compute integral multiplier + t = __clc_trunc(dx / w); + + // Compute w * t in quad precision + p = w * t; + pp = __clc_fma(w, t, -p); + + // Subtract w * t from dx + v = dx - p; + dx = v + (((dx - v) - p) - pp); + + // If t was one too large, dx will be negative. Add back one w. + dx += dx < 0.0 ? w : 0.0; + + // Scale w down by 2^(-53) for the next iteration + w *= scale; + } + + // One more time + t = __clc_floor(dx / w); + + p = w * t; + pp = __clc_fma(w, t, -p); + v = dx - p; + dx = v + (((dx - v) - p) - pp); + i = dx < 0.0; + dx += i ? w : 0.0; + + // At this point, dx lies in the range [0,dy) + double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx)); + dx = __clc_as_double(ax); + + // Now handle |x| == |y| + int c = dx == dy; + t = __clc_as_double(xsgn); + ret = c ? t : ret; + + // Next, handle |x| < |y| + c = dx < dy; + ret = c ? x : ret; + + // We don't need anything special for |x| == 0 + + // |y| is 0 + c = dy == 0.0; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; + + // y is +-Inf, NaN + c = yexp > BIASEDEMAX_DP64; + t = y == y ? x : y; + ret = c ? t : ret; + + // x is +=Inf, NaN + c = xexp > BIASEDEMAX_DP64; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; + + return ret; +} + +#define __CLC_DOUBLE_ONLY +#define __CLC_FUNCTION __clc_fmod +#define __CLC_BODY +#include +#undef __CLC_FUNCTION + +#endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// Forward the half version of this builtin onto the float one +#define __CLC_HALF_ONLY #define __CLC_FUNCTION __clc_fmod -#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_fmod -#define __CLC_BODY "clc/shared/binary_def.inc" +#define __CLC_BODY +#include -#include "clc/math/gentype.inc" +#endif