Skip to content

Commit

Permalink
[libc][math] Implement double precision log function correctly rounde…
Browse files Browse the repository at this point in the history
…d to all rounding modes.

Implement double precision log function correctly rounded to all
rounding modes.

See https://reviews.llvm.org/D150014 for a more detail description of the algorithm.

**Performance**

  - For `0.5 <= x <= 2`, the fast pass hitting rate is about 99.93%.

  - Reciprocal throughput from CORE-MATH's perf tool on Ryzen 5900X:
```
$ ./perf.sh log
GNU libc version: 2.35
GNU libc release: stable

-- CORE-MATH reciprocal throughput -- with FMA
[####################] 100 %
Ntrial = 20 ; Min = 17.465 + 0.596 clc/call; Median-Min = 0.602 clc/call; Max = 18.389 clc/call;

-- CORE-MATH reciprocal throughput -- without FMA (-march=x86-64-v2)
[####################] 100 %
Ntrial = 20 ; Min = 54.961 + 2.606 clc/call; Median-Min = 2.180 clc/call; Max = 59.583 clc/call;

-- System LIBC reciprocal throughput --
[####################] 100 %
Ntrial = 20 ; Min = 12.608 + 0.276 clc/call; Median-Min = 0.359 clc/call; Max = 13.147 clc/call;

-- LIBC reciprocal throughput -- with FMA
[####################] 100 %
Ntrial = 20 ; Min = 20.952 + 0.468 clc/call; Median-Min = 0.602 clc/call; Max = 21.881 clc/call;

-- LIBC reciprocal throughput -- without FMA
[####################] 100 %
Ntrial = 20 ; Min = 18.569 + 0.552 clc/call; Median-Min = 0.601 clc/call; Max = 19.259 clc/call;

```
  - Latency from CORE-MATH's perf tool on Ryzen 5900X:
```
$ ./perf.sh log --latency
GNU libc version: 2.35
GNU libc release: stable

-- CORE-MATH latency -- with FMA
[####################] 100 %
Ntrial = 20 ; Min = 48.431 + 0.699 clc/call; Median-Min = 0.073 clc/call; Max = 51.269 clc/call;

-- CORE-MATH latency -- without FMA (-march=x86-64-v2)
[####################] 100 %
Ntrial = 20 ; Min = 64.865 + 3.235 clc/call; Median-Min = 3.475 clc/call; Max = 71.788 clc/call;

-- System LIBC latency --
[####################] 100 %
Ntrial = 20 ; Min = 42.151 + 2.090 clc/call; Median-Min = 2.270 clc/call; Max = 44.773 clc/call;

-- LIBC latency -- with FMA
[####################] 100 %
Ntrial = 20 ; Min = 35.266 + 0.479 clc/call; Median-Min = 0.373 clc/call; Max = 36.798 clc/call;

-- LIBC latency -- without FMA
[####################] 100 %
Ntrial = 20 ; Min = 48.518 + 0.484 clc/call; Median-Min = 0.500 clc/call; Max = 49.896 clc/call;
```
  - Accurate pass latency:
```
$ ./perf.sh log --latency --simple_stat
GNU libc version: 2.35
GNU libc release: stable

-- CORE-MATH latency -- with FMA
598.306

-- CORE-MATH latency -- without FMA (-march=x86-64-v2)
632.925

-- LIBC latency -- with FMA
455.632

-- LIBC latency -- without FMA
488.564
```

Reviewed By: zimmermann6

Differential Revision: https://reviews.llvm.org/D150131
  • Loading branch information
lntue committed May 23, 2023
1 parent cc6a6c4 commit a68bbf4
Show file tree
Hide file tree
Showing 15 changed files with 1,866 additions and 777 deletions.
1 change: 1 addition & 0 deletions libc/config/darwin/arm/entrypoints.txt
Expand Up @@ -178,6 +178,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.log10f
libc.src.math.log1pf
libc.src.math.log2f
libc.src.math.log
libc.src.math.logf
libc.src.math.logb
libc.src.math.logbf
Expand Down
1 change: 1 addition & 0 deletions libc/config/linux/aarch64/entrypoints.txt
Expand Up @@ -289,6 +289,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.log10f
libc.src.math.log1pf
libc.src.math.log2f
libc.src.math.log
libc.src.math.logf
libc.src.math.logb
libc.src.math.logbf
Expand Down
1 change: 1 addition & 0 deletions libc/config/linux/x86_64/entrypoints.txt
Expand Up @@ -294,6 +294,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.log10f
libc.src.math.log1pf
libc.src.math.log2f
libc.src.math.log
libc.src.math.logf
libc.src.math.logb
libc.src.math.logbf
Expand Down
1 change: 1 addition & 0 deletions libc/config/windows/entrypoints.txt
Expand Up @@ -171,6 +171,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.log10f
libc.src.math.log1pf
libc.src.math.log2f
libc.src.math.log
libc.src.math.logf
libc.src.math.logb
libc.src.math.logbf
Expand Down
1 change: 1 addition & 0 deletions libc/spec/stdc.td
Expand Up @@ -412,6 +412,7 @@ def StdC : StandardSpec<"stdc"> {

FunctionSpec<"log2f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,

FunctionSpec<"log", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"logf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,

FunctionSpec<"logb", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
Expand Down
1 change: 1 addition & 0 deletions libc/src/math/CMakeLists.txt
Expand Up @@ -118,6 +118,7 @@ add_math_entrypoint_object(log1pf)

add_math_entrypoint_object(log2f)

add_math_entrypoint_object(log)
add_math_entrypoint_object(logf)

add_math_entrypoint_object(logb)
Expand Down
30 changes: 30 additions & 0 deletions libc/src/math/generic/CMakeLists.txt
Expand Up @@ -768,6 +768,15 @@ add_object_library(
libc.src.__support.number_pair
)

add_header_library(
log_range_reduction
HDRS
log_range_reduction.h
DEPENDS
.common_constants
libc.src.__support.FPUtil.dyadic_float
)

add_entrypoint_object(
log10
SRCS
Expand All @@ -776,6 +785,7 @@ add_entrypoint_object(
../log10.h
DEPENDS
.common_constants
.log_range_reduction
libc.src.__support.FPUtil.fenv_impl
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.multiply_add
Expand Down Expand Up @@ -840,6 +850,26 @@ add_entrypoint_object(
-O3
)

add_entrypoint_object(
log
SRCS
log.cpp
HDRS
../log.h
DEPENDS
.common_constants
.log_range_reduction
libc.src.__support.FPUtil.fenv_impl
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.multiply_add
libc.src.__support.FPUtil.polyeval
libc.src.__support.FPUtil.double_double
libc.src.__support.FPUtil.dyadic_float
libc.src.__support.macros.optimization
COMPILE_OPTIONS
-O3
)

add_entrypoint_object(
logf
SRCS
Expand Down
72 changes: 72 additions & 0 deletions libc/src/math/generic/common_constants.cpp
Expand Up @@ -365,6 +365,78 @@ alignas(64) const NumberPair<double> LOG_R_DD[128] = {
{0.0, 0.0},
};

// Logarithm range reduction - Step 2:
// r(k) = 2^-16 round(2^16 / (1 + k*2^-14)) for k = -2^6 .. 2^7.
// Output range:
// [-0x1.3ffcp-15, 0x1.3e3dp-15]
// We store S2[i] = 2^16 (r(i - 2^6) - 1).
alignas(64) const int S2[193] = {
0x101, 0xfd, 0xf9, 0xf5, 0xf1, 0xed, 0xe9, 0xe5, 0xe1,
0xdd, 0xd9, 0xd5, 0xd1, 0xcd, 0xc9, 0xc5, 0xc1, 0xbd,
0xb9, 0xb4, 0xb0, 0xac, 0xa8, 0xa4, 0xa0, 0x9c, 0x98,
0x94, 0x90, 0x8c, 0x88, 0x84, 0x80, 0x7c, 0x78, 0x74,
0x70, 0x6c, 0x68, 0x64, 0x60, 0x5c, 0x58, 0x54, 0x50,
0x4c, 0x48, 0x44, 0x40, 0x3c, 0x38, 0x34, 0x30, 0x2c,
0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0xc, 0x8,
0x4, 0x0, -0x4, -0x8, -0xc, -0x10, -0x14, -0x18, -0x1c,
-0x20, -0x24, -0x28, -0x2c, -0x30, -0x34, -0x38, -0x3c, -0x40,
-0x44, -0x48, -0x4c, -0x50, -0x54, -0x58, -0x5c, -0x60, -0x64,
-0x68, -0x6c, -0x70, -0x74, -0x78, -0x7c, -0x80, -0x84, -0x88,
-0x8c, -0x90, -0x94, -0x98, -0x9c, -0xa0, -0xa4, -0xa8, -0xac,
-0xb0, -0xb4, -0xb7, -0xbb, -0xbf, -0xc3, -0xc7, -0xcb, -0xcf,
-0xd3, -0xd7, -0xdb, -0xdf, -0xe3, -0xe7, -0xeb, -0xef, -0xf3,
-0xf7, -0xfb, -0xff, -0x103, -0x107, -0x10b, -0x10f, -0x113, -0x117,
-0x11b, -0x11f, -0x123, -0x127, -0x12b, -0x12f, -0x133, -0x137, -0x13a,
-0x13e, -0x142, -0x146, -0x14a, -0x14e, -0x152, -0x156, -0x15a, -0x15e,
-0x162, -0x166, -0x16a, -0x16e, -0x172, -0x176, -0x17a, -0x17e, -0x182,
-0x186, -0x18a, -0x18e, -0x192, -0x195, -0x199, -0x19d, -0x1a1, -0x1a5,
-0x1a9, -0x1ad, -0x1b1, -0x1b5, -0x1b9, -0x1bd, -0x1c1, -0x1c5, -0x1c9,
-0x1cd, -0x1d1, -0x1d5, -0x1d9, -0x1dd, -0x1e0, -0x1e4, -0x1e8, -0x1ec,
-0x1f0, -0x1f4, -0x1f8, -0x1fc};

// Logarithm range reduction - Step 3:
// r(k) = 2^-21 round(2^21 / (1 + k*2^-21)) for k = -80 .. 80.
// Output range:
// [-0x1.01928p-22 , 0x1p-22]
// We store S[i] = 2^21 (r(i - 80) - 1).
alignas(64) const int S3[161] = {
0x50, 0x4f, 0x4e, 0x4d, 0x4c, 0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46,
0x45, 0x44, 0x43, 0x42, 0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b,
0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30,
0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25,
0x24, 0x23, 0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a,
0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0xf,
0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
0x3, 0x2, 0x1, 0x0, -0x1, -0x2, -0x3, -0x4, -0x5, -0x6, -0x7,
-0x8, -0x9, -0xa, -0xb, -0xc, -0xd, -0xe, -0xf, -0x10, -0x11, -0x12,
-0x13, -0x14, -0x15, -0x16, -0x17, -0x18, -0x19, -0x1a, -0x1b, -0x1c, -0x1d,
-0x1e, -0x1f, -0x20, -0x21, -0x22, -0x23, -0x24, -0x25, -0x26, -0x27, -0x28,
-0x29, -0x2a, -0x2b, -0x2c, -0x2d, -0x2e, -0x2f, -0x30, -0x31, -0x32, -0x33,
-0x34, -0x35, -0x36, -0x37, -0x38, -0x39, -0x3a, -0x3b, -0x3c, -0x3d, -0x3e,
-0x3f, -0x40, -0x41, -0x42, -0x43, -0x44, -0x45, -0x46, -0x47, -0x48, -0x49,
-0x4a, -0x4b, -0x4c, -0x4d, -0x4e, -0x4f, -0x50,
};

// Logarithm range reduction - Step 4
// r(k) = 2^-28 round(2^28 / (1 + k*2^-28)) for k = -65 .. 64.
// Output range:
// [-0x1.0002143p-29 , 0x1p-29]
// We store S[i] = 2^28 (r(i - 65) - 1).
alignas(64) const int S4[130] = {
0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37,
0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c,
0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21,
0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16,
0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0xf, 0xe, 0xd, 0xc, 0xb,
0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0,
-0x1, -0x2, -0x3, -0x4, -0x5, -0x6, -0x7, -0x8, -0x9, -0xa, -0xb,
-0xc, -0xd, -0xe, -0xf, -0x10, -0x11, -0x12, -0x13, -0x14, -0x15, -0x16,
-0x17, -0x18, -0x19, -0x1a, -0x1b, -0x1c, -0x1d, -0x1e, -0x1f, -0x20, -0x21,
-0x22, -0x23, -0x24, -0x25, -0x26, -0x27, -0x28, -0x29, -0x2a, -0x2b, -0x2c,
-0x2d, -0x2e, -0x2f, -0x30, -0x31, -0x32, -0x33, -0x34, -0x35, -0x36, -0x37,
-0x38, -0x39, -0x3a, -0x3b, -0x3c, -0x3d, -0x3e, -0x3f, -0x40,
};

// Lookup table for exp(m) with m = -104, ..., 89.
// -104 = floor(log(single precision's min denormal))
// 89 = ceil(log(single precision's max normal))
Expand Down
5 changes: 5 additions & 0 deletions libc/src/math/generic/common_constants.h
Expand Up @@ -39,6 +39,11 @@ constexpr double LOG_COEFFS[6] = {-0x1.fffffffffffffp-2, 0x1.5555555554a9bp-2,
-0x1.0000000094567p-2, 0x1.99999dcc9823cp-3,
-0x1.55550ac2e537ap-3, 0x1.21a02c4e624d7p-3};

// Logarithm Range Reduction - Step 2, 3, and 4.
extern const int S2[193];
extern const int S3[161];
extern const int S4[130];

// log(2) generated by Sollya with:
// > a = 2^-43 * nearestint(2^43*log(2));
// LSB = 2^-43 is chosen so that e_x * LOG_2_HI is exact for -1075 < e_x < 1024.
Expand Down

0 comments on commit a68bbf4

Please sign in to comment.